## Extraction of data from [timeline of events](https://en.wikipedia.org/wiki/Timeline_of_the_21st_century)

Before running the cells below, download the XML export of the page from this [link](https://en.wikipedia.org/wiki/Special:Export/Timeline_of_the_21st_century) and save it as `../datasets/wiki_events.xml`.

In [36]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import glob
from bs4 import BeautifulSoup
import re
import os
import string
import random

import nltk

from gensim.models import Word2Vec

from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score

# Fetch the NLTK resources needed later: the stop-word lists and the
# "punkt" models used by word_tokenize.
for resource in ("stopwords", "punkt"):
    nltk.download(resource)

# Seed the random sources used in this notebook.
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it
# here only affects child processes, not the running kernel.
SEED = 42
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/arbenmiftari/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/arbenmiftari/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [93]:
# Load the Wikipedia XML export of the timeline page and parse it.
path = '../datasets/wiki_events.xml'
# MediaWiki exports are UTF-8; pass the encoding explicitly so decoding does
# not depend on the platform's default locale encoding.
with open(path, 'r', encoding='utf-8') as f:
    data = f.read()

Bs_data = BeautifulSoup(data, "xml")

In [94]:
# Keep only the part of the page text that starts at the 2005 heading.
plain_txt = Bs_data.text
idx = plain_txt.find('===[[2005]]===')
if idx == -1:
    # str.find returns -1 when the marker is absent; slicing with -1 would
    # silently keep only the last character and produce an empty dataset.
    raise ValueError(
        "Start marker '===[[2005]]===' not found in the export; "
        "the page layout may have changed."
    )
text = plain_txt[idx:]

In [95]:
# Collapse doubled newlines first, then drop the square brackets left over
# from the wiki link markup.
text = text.replace('\n\n', '\n').replace('[', '').replace(']', '')

In [96]:
# One list entry per line of the cleaned text.
split_txt = text.split('\n')

In [97]:
# Build new_list from the raw lines:
#   * lines containing 'see' are skipped,
#   * the first line containing 'See' ends the scan,
#   * lines mentioning '2020s', '2010s' or '--' are skipped,
#   * heading lines ('===') are kept whole, every other line loses its first
#     two characters.
# The order of the checks matters: the 'See' stop must be evaluated before
# the remaining skip rules.
copy_txt = split_txt.copy()
new_list = []
for line in copy_txt:
    if 'see' in line:
        continue
    if 'See' in line:
        break
    if any(marker in line for marker in ('2020s', '2010s', '--')):
        continue
    new_list.append(line if '===' in line else line[2:])

In [99]:
# English month names, used to recognise a date prefix on an event line.
list_months = [
    "January",
    "February",
    "March",
    "April",
    "May",
    "June",
    "July",
    "August",
    "September",
    "October",
    "November",
    "December",
]

In [103]:
# Turn the filtered lines into (year, date, event) tuples.
#
# A heading line ("===YYYY===") updates the current year and produces no
# tuple. For any other line, the text before the first ':' is taken as the
# date only when a ':' is actually present AND that prefix contains a month
# name; otherwise the date field stays empty and the line is kept unchanged.
list_tuples = []
year = ''  # stays empty until the first heading line has been seen
for element in new_list:
    if '===' in element:
        # Characters 3..6 of the heading hold the four-digit year.
        year = element[3:7]
        continue
    month = ''
    prefix, colon, rest = element.partition(':')
    # Requiring the colon avoids the find() == -1 case, where slicing would
    # put almost the whole event text into the date field.
    if colon and any(name in prefix for name in list_months):
        month = prefix
        element = rest
    list_tuples.append((year, month, element))

In [104]:
# One row per event tuple; the 'Day' column receives the second tuple field
# (the date prefix, or an empty string when none was extracted).
events_df = pd.DataFrame(list_tuples, columns =['Year','Day', 'Event'])

In [105]:
# Persist the events table; index=False is not passed, so the row index is
# written as well.
events_df.to_csv('../datasets/wiki_events.csv')

# Data exploration to determine controversial videos

In [None]:
# Per-video metadata and statistics (the like/dislike counts are used below).
# NOTE(review): this cell reads from "../data/" while the other cells use
# "../datasets/" - confirm the path.
data_metadata_path = "../data/yt_metadata_helper.feather"
df_metadata = pd.read_feather(data_metadata_path)
# Constant column of ones; presumably so the groupby-sum further down also
# yields a row count per category - TODO confirm.
df_metadata['dummy']=1
df_metadata.head(5)

In [None]:
def check_nan(df):
    """Print the name of every column of ``df`` that contains a missing value.

    Columns are reported in their original order, one per line. When no
    column contains a missing value, the single line ``None`` is printed
    instead. Nothing is returned.
    """
    columns_with_nan = [
        name for name in df.columns if df[name].isnull().values.any()
    ]
    if columns_with_nan:
        for name in columns_with_nan:
            print(name)
    else:
        print('None')
# List the metadata columns that contain missing values.
check_nan(df_metadata)

In [None]:
# Replace every missing value in the metadata with 0 (modifies df_metadata
# in place), then show ten random rows as a sanity check.
df_metadata.fillna(0, inplace=True)
df_metadata.sample(10)

In [None]:
# Per-category totals of the numeric columns.
df_cat_count = df_metadata.copy()
# numeric_only=True restricts the sum to numeric columns; without it, recent
# pandas versions also try to sum text/date columns, which either
# concatenates strings or raises.
df_cat_count = df_cat_count.groupby("categories", as_index=False).sum(numeric_only=True)
# Drop the rows labelled 0, 8 and 15 of the aggregated table.
# NOTE(review): which categories these labels correspond to is not visible
# here - confirm against the data.
df_cat_count.drop(labels=[0,8,15], axis=0, inplace=True)

In [None]:
# Display the aggregated per-category table.
df_cat_count

In [None]:
# Likes and dislikes per category as grouped bars, on a logarithmic y axis.
df_cat_count.plot.bar(x='categories', y=['like_count', 'dislike_count'], logy=True)

In [None]:
# Share of dislikes among all like/dislike reactions, per category.
df_cat_count['ratio']=df_cat_count['dislike_count']/(df_cat_count['like_count']+df_cat_count['dislike_count'])
# sort_values returns a new frame, so it has to be the final expression of
# the cell to be displayed; df_cat_count itself keeps its row order.
df_cat_count[['categories', 'ratio']].sort_values(by=['ratio'])

In [None]:
# Dislike ratio per category as a bar chart.
df_cat_count.plot.bar(x='categories', y='ratio')

In [None]:
# Work on a copy so df_metadata is left untouched, and compute the dislike
# ratio of every video. Videos with zero likes and zero dislikes give 0/0,
# i.e. NaN.
df = df_metadata.copy()
df['ratio']=df['dislike_count']/(df['like_count']+df['dislike_count'])

In [None]:
# Replace the NaN ratios (videos without any like or dislike) with 0.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form is deprecated and does not
# update the frame when pandas copy-on-write is enabled.
df['ratio'] = df['ratio'].fillna(0)
# Sanity check: lists the distinct values of the null mask.
df['ratio'].isnull().unique()

In [None]:
# One log-scaled histogram of the dislike ratio per category, on a 5x3 grid
# sharing the y axis. zip() stops at the shorter input, so at most 15
# categories are drawn.
fig, axs = plt.subplots(5, 3, figsize=(16, 8), sharey=True)

for cat, ax in zip(df['categories'].unique(), axs.ravel()):
    videos_in_category = df[df.categories == cat]
    videos_in_category.hist(column='ratio', bins=20, ax=ax, log=True)
    ax.set_title(cat)
    ax.set_xlabel('ratio')
    ax.set_ylabel('count')

plt.tight_layout()

## Exploration of the content for controversial videos

In [None]:
# Load one chunk of the full metadata for the content exploration.
path = '../datasets/metadata_chunks/'
# glob returns its matches in arbitrary order; sorting makes index 50 refer
# to the same file on every machine and run.
csv_files = sorted(glob.glob(path+'*.csv.gz'))
df0 = pd.read_csv(csv_files[50], usecols=['categories', 'description', 'tags', 'title', 'dislike_count', 'like_count'])
df0.head(5)

In [None]:
# Data preprocessing: merge title and description into one text column, keep
# only 'News & Politics' videos with more than 25 likes + dislikes, and
# compute their dislike ratio.

# .copy() gives an independent frame, so the column assignments below do not
# act on a slice of df0.
df_raw = df0.drop_duplicates(subset=['title']).copy()
# A missing description would turn the whole concatenation into NaN and lose
# the title, so treat it as an empty string.
df_raw["video_info"] = (
    df_raw['title'].astype(str) + ": " + df_raw["description"].fillna("").astype(str)
)
df_raw = df_raw.drop(['title', 'description'], axis=1)

is_news = df_raw.categories == 'News & Politics'
enough_votes = df_raw.dislike_count + df_raw.like_count > 25
df_raw = df_raw[is_news & enough_votes].copy()

df_raw['ratio'] = df_raw['dislike_count'] / (df_raw['like_count'] + df_raw['dislike_count'])
# Assign back rather than using inplace=True on the selected column.
df_raw['ratio'] = df_raw['ratio'].fillna(0)
df_raw = df_raw.drop(['dislike_count', 'like_count', 'categories'], axis=1)
df_raw.sample(5)

In [None]:
# Keep only the videos whose dislike ratio is strictly above the 0.33
# quantile of all ratios.
quantile = df_raw['ratio'].quantile(0.33)
print(quantile)
above_threshold = df_raw['ratio'] > quantile
df_raw = df_raw[above_threshold]
df_raw.sample(5)

In [None]:
def clean_text(text, tokenizer, stopwords):
    """Normalise ``text`` and return its list of useful tokens.

    Params:
        text: Value to clean; it is converted with ``str()`` first.
        tokenizer: Callable that splits the cleaned string into tokens.
        stopwords: Container of tokens to discard (tested with ``in``).

    Returns:
        The tokens that are not stop words, are not made of digits only and
        are longer than one character, in their original order.
    """
    cleaned = str(text).lower()

    # Applied in this exact order: each pattern works on the previous result.
    substitutions = (
        (r"\[(.*?)\]", ""),  # bracketed chunks such as "[+XYZ chars]"
        (r"\s+", " "),  # runs of whitespace
        (r"\w+…|…", ""),  # ellipsis, together with the word it truncates
        (r"(?<=\w)-(?=\w)", " "),  # dash joining two words
        (f"[{re.escape(string.punctuation)}]", ""),  # ASCII punctuation
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return [
        token
        for token in tokenizer(cleaned)
        if token not in stopwords and not token.isdigit() and len(token) > 1
    ]

In [None]:
# Prepare the dataframe that will be clustered: tokenise the texts, drop
# duplicates and empty documents.
custom_stopwords = set(stopwords.words("english"))

df = df_raw.copy()

# Tokenise every video text with clean_text, using NLTK's word_tokenize and
# the English stop-word set.
df["text"] = df_raw['video_info']
df["tokens"] = df["text"].map(lambda x: clean_text(x, word_tokenize, custom_stopwords))

# Remove documents whose token lists are identical after preprocessing.
# np.unique also hands back, for each distinct token list, the position of
# one matching row; selecting those positions with iloc therefore reorders
# the rows as well.
_, idx = np.unique(df["tokens"], return_index=True)
df = df.iloc[idx, :]

# Drop documents with an empty token list and keep only the two columns
# used from here on.
df = df.loc[df.tokens.map(lambda x: len(x) > 0), ["text", "tokens"]]

# Plain arrays of the raw texts and of their token lists (same row order).
docs = df["text"].values
tokenized_docs = df["tokens"].values

print(f"Original dataframe: {df_raw.shape}")
print(f"Pre-processed dataframe: {df.shape}")

In [None]:
# Train a Word2Vec model on the tokenised documents to get a 100-dimensional
# vector per word. workers=1 and the fixed seed are presumably there to make
# training reproducible - TODO confirm against the gensim documentation.
model = Word2Vec(sentences=tokenized_docs, vector_size=100, workers=1, seed=SEED)

In [None]:
# Generate vectors for list of documents using a Word Embedding

def vectorize(list_of_docs, model):
    """Turn each tokenised document into a single feature vector.

    Params:
        list_of_docs: Iterable of token sequences, one per document.
        model: Object exposing ``wv`` (token -> vector lookup supporting
            ``in`` and ``[]``) and ``vector_size``.

    Returns:
        A list with one vector per document: the mean of the vectors of the
        tokens found in ``model.wv``, or a zero vector of length
        ``model.vector_size`` when none of the tokens is found.
    """
    keyed_vectors = model.wv
    features = []

    for doc_tokens in list_of_docs:
        token_vectors = []
        for tok in doc_tokens:
            if tok not in keyed_vectors:
                continue
            try:
                token_vectors.append(keyed_vectors[tok])
            except KeyError:
                continue

        if not token_vectors:
            features.append(np.zeros(model.vector_size))
            continue
        features.append(np.asarray(token_vectors).mean(axis=0))

    return features
    
# Embed every tokenised document, then show the number of documents and the
# length of the first vector.
vectorized_docs = vectorize(tokenized_docs, model=model)
len(vectorized_docs), len(vectorized_docs[0])

In [None]:
# Generate clusters and print Silhouette metrics using MBKmeans
def mbkmeans_clusters(X, k, mb, print_silhouette_values):
    """Fit a MiniBatchKMeans model and report its silhouette metrics.

    Params:
        X: Matrix of features.
        k: Number of clusters.
        mb: Size of mini-batches.
        print_silhouette_values: When true, also print one line per cluster
            (size, mean, min and max silhouette), best mean first.

    Returns:
        Tuple ``(fitted model, labels of the fitted model)``.
    """
    km = MiniBatchKMeans(n_clusters=k, batch_size=mb).fit(X)
    labels = km.labels_

    print(f"For n_clusters = {k}")
    print(f"Silhouette coefficient: {silhouette_score(X, labels):0.2f}")

    if not print_silhouette_values:
        return km, labels

    per_sample = silhouette_samples(X, labels)
    print("Silhouette values:")

    # One (cluster id, size, mean, min, max) row per cluster.
    stats = []
    for cluster_id in range(k):
        scores = per_sample[labels == cluster_id]
        stats.append(
            (cluster_id, scores.shape[0], scores.mean(), scores.min(), scores.max())
        )

    # Highest mean silhouette first.
    stats.sort(key=lambda row: row[2], reverse=True)
    for cluster_id, size, avg, low, high in stats:
        print(
            f"    Cluster {cluster_id}: Size:{size} | Avg:{avg:.2f} | Min:{low:.2f} | Max: {high:.2f}"
        )
    return km, labels

In [None]:
# Cluster the document vectors into 200 groups (mini-batches of 500) and
# collect text, space-joined tokens and cluster label of every document.
clustering, cluster_labels = mbkmeans_clusters(
    X=vectorized_docs, k=200, mb=500, print_silhouette_values=True
)

joined_tokens = [" ".join(doc_tokens) for doc_tokens in tokenized_docs]
df_clusters = pd.DataFrame(
    {"text": docs, "tokens": joined_tokens, "cluster": cluster_labels}
)

In [None]:
# For every cluster, print the five vocabulary words whose vectors are most
# similar to the cluster centroid.
print("Most representative terms per cluster (based on centroids):")
# Iterate over the fitted centroids instead of a hard-coded range(200), so
# this cell keeps working when the number of clusters is changed.
for i, centroid in enumerate(clustering.cluster_centers_):
    most_representative = model.wv.most_similar(positive=[centroid], topn=5)
    # Each term is followed by one space, as in the printed output so far.
    tokens_per_cluster = "".join(f"{term} " for term, _ in most_representative)
    print(f"Cluster {i}: {tokens_per_cluster}")