# Analysis of the next browsing topic

In [89]:
import sqlite3
import pandas as pd
import argparse
from datetime import datetime, timedelta

def parse_args():
    parser = argparse.ArgumentParser(description="Path to the history file")
    parser.add_argument("--path", default="/home/doms/snap/firefox/common/.mozilla/firefox/g93bal4w.default", type=str, help="Path to the history file")
    args = parser.parse_args()
    return args.__dict__ 

def process(path: str) -> pd.DataFrame:
    SQL = """select origin, last_access_time from origin"""
    con = sqlite3.connect(f"{path}/storage.sqlite")
    cur = con.cursor()
    result = cur.execute(SQL)
    df = pd.DataFrame([], columns=["url", "last_visit_time"])
    for row in result.fetchall():
        d = dict(list(zip(df.columns, row)))
        d = {k: [v] for k,v in d.items()}
        df_new = pd.DataFrame(d, columns=df.columns)
        df = pd.concat([df, df_new])
    df = df.reset_index()[["url", "last_visit_time"]]
    df = df.sort_values("last_visit_time", ascending=False)
    df["last_visit_time"] = df["last_visit_time"].apply(lambda d: datetime(1970, 1, 1, 2, 0, 0) + timedelta(microseconds = d))
    return df 

df = process("/home/doms/snap/firefox/common/.mozilla/firefox/g93bal4w.default")
df

Unnamed: 0,url,last_visit_time
93,https://www.perplexity.ai,2024-10-17 14:21:26.630065
519,https://www.google.com,2024-10-17 14:21:26.626737
624,https://www.shutterstock.com,2024-10-17 14:21:26.623520
549,https://www.dataexpert.io,2024-10-17 14:21:26.620636
220,moz-extension://edabc34d-306d-43f9-988e-73a81f...,2024-10-17 14:21:26.607129
...,...,...
718,https://www.legalzoom.com,2024-08-12 19:08:01.919563
1318,https://accounts.firefox.com,2024-08-12 18:12:09.746082
985,https://www.netherlandsworldwide.nl,2024-08-12 18:11:14.715288
611,https://www.google.com^partitionKey=%28https%2...,2024-08-12 15:27:56.905986


In [235]:
from datetime import datetime, timedelta
import requests
from bs4 import BeautifulSoup
import langdetect


def extract_desc(link):
    try:
        html = requests.get(link).text
        soup = BeautifulSoup(html, 'html.parser')
        metas = soup.find_all("meta")
        content = None
        for meta in metas:
            if meta.attrs.get("name") == "description":
                content = meta.attrs["content"]
    except Exception:
        return None
    return content

def get_lang(sentence):
    try:
        if sentence == None:
            return None
        ds = langdetect.detect_langs(sentence)
        M, arg_max = 0, -1
        for i, d in enumerate(ds):
            if d.prob > M:
                M = d.prob
                arg_max = i
        return ds[arg_max].lang
    except Exception:
        return None 

sample = df.iloc[0:20]
sample.apply(lambda r : extract_desc(r))
sample["desc"] = sample["url"].apply(extract_desc)
sample["lang"] = sample["desc"].apply(get_lang)
sample

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample["desc"] = sample["url"].apply(extract_desc)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample["lang"] = sample["desc"].apply(get_lang)


Unnamed: 0,url,last_visit_time,lang,desc
93,https://www.perplexity.ai,2024-10-17 14:21:26.630065,,
519,https://www.google.com,2024-10-17 14:21:26.626737,,
624,https://www.shutterstock.com,2024-10-17 14:21:26.623520,en,Download the best royalty free images from Shu...
549,https://www.dataexpert.io,2024-10-17 14:21:26.620636,en,Join the ultimate data engineering academy to ...
220,moz-extension://edabc34d-306d-43f9-988e-73a81f...,2024-10-17 14:21:26.607129,,
390,moz-extension://edabc34d-306d-43f9-988e-73a81f...,2024-10-17 14:21:26.606137,,
511,https://accounts.google.com,2024-10-17 14:21:26.500744,,
69,https://www.whatsapp.com,2024-10-17 14:15:48.421649,en,Use WhatsApp Messenger to stay in touch with f...
1993,https://stackoverflow.com,2024-10-17 14:15:34.991592,en,"Stack Overflow is the largest, most trusted on..."
1749,https://www.njuskalo.hr,2024-10-17 14:13:25.443272,hr,Oglasnik sa više od 500.000 posjeta dnevno i v...


NameError: name 'x' is not defined

In [151]:
desc = sample.apply(lambda x : extract_desc(x))

In [160]:
desc.apply(lambda s : get_lang(s))

LangDetectException: No features in text.

[en:0.999997159440808]