In [None]:
import ast

import pandas as pd
import plotly.graph_objects as go

# Folder that holds the dataset CSV; the path is relative, so it is resolved
# against the notebook's working directory.
DATA_FOLDER = '../data/Cleantech Media Dataset'

In [None]:
# Build the CSV path once, then load the raw dataset into `df`.
dataset_csv = f'{DATA_FOLDER}/cleantech_media_dataset_v2_2024-02-23.csv'
df = pd.read_csv(dataset_csv)

In [None]:
# Preview the first rows to check that the CSV was parsed as expected.
df.head()

In [None]:
# Per-column data-quality overview: non-null counts, missing values and cardinality.
row_total = len(df)
missing_per_column = df.isna().sum()
distinct_per_column = df.nunique()

summary_df = pd.DataFrame(
    {
        'Total Count': df.count(),
        'NaN Count': missing_per_column,
        # Percentages are relative to the number of rows in the frame.
        'NaN Percentage (%)': (missing_per_column / row_total) * 100,
        'Unique Count': distinct_per_column,
        'Unique Percentage (%)': (distinct_per_column / row_total) * 100,
    },
    index=df.columns,
)

summary_df

In [None]:
# Number of articles per publisher domain as a two-column frame.
domain_freq = (
    df['domain']
    .value_counts()
    .rename_axis('domain')
    .reset_index(name='count')
)

# One bar per domain; passing the trace via `data=` is equivalent to add_trace.
publisher_bars = go.Bar(x=domain_freq['domain'], y=domain_freq['count'])
fig = go.Figure(data=[publisher_bars])
fig.update_layout(
    title='Frequency of Publishers in Cleantech',
    xaxis_title='Domain',
    yaxis_title='Frequency',
)

fig.show()

### Taking a closer look at titles
As the summary has shown, only `9569` of the `9593` scraped resources in the dataset have a unique title. This subsection explores whether these "duplicate titles" stem from an underlying error or whether the duplicate occurrences can be ignored.

In [None]:
# Titles that occur more than once, together with their number of occurrences.
title_counts = df['title'].value_counts()
title_freq = (
    title_counts[title_counts > 1]
    .rename_axis('title')
    .reset_index(name='count')
)

title_freq

Now, let's take a closer look at the contents of the suspected duplicate documents.

In [None]:
def calculate_all_duplicate_document_contents(df, title_freq):
    """Count duplicated content chunks among documents that share a title.

    For every title in ``title_freq['title']`` the ``content`` values of all
    rows of ``df`` with that title are pooled into one flat sequence of
    chunks, and the chunks that repeat an earlier one are counted.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``'title'`` and ``'content'``. A ``content``
        cell may be the string repr of a list of chunks (it is parsed with
        ``ast.literal_eval``) or an already parsed list, so the function keeps
        working after ``df['content']`` has been converted in place.
    title_freq : pandas.DataFrame
        Must provide a ``'title'`` column with the titles to inspect.

    Returns
    -------
    pandas.DataFrame
        Columns ``'title'`` and ``'duplicated_count'``, one row per distinct
        title in order of first appearance.
    """
    def _as_chunks(content):
        # Only string cells still need parsing; parsed lists pass through.
        return ast.literal_eval(content) if isinstance(content, str) else content

    duplicates_counts = {}

    for title in title_freq['title']:
        chunks = df.loc[df['title'] == title, 'content'].apply(_as_chunks).explode()
        # duplicated() flags every repeat after the first occurrence.
        duplicates_counts[title] = chunks.duplicated().sum()

    return pd.DataFrame(list(duplicates_counts.items()), columns=['title', 'duplicated_count'])


# Run the chunk-level duplicate check for every title that occurs more than once.
duplicated_title_contents = calculate_all_duplicate_document_contents(df, title_freq)

duplicated_title_contents

The function shows that the documents with duplicate titles do contain actual duplicate information on a per-chunk basis.

This could mean that there are even more duplicate chunks under titles that aren't duplicated, so let's look at that next:

In [None]:
# Parse the stringified chunk lists in place. Cells that are already parsed are
# passed through unchanged, so re-running this cell no longer fails inside
# ast.literal_eval (which only accepts strings / AST nodes).
df['content'] = df['content'].apply(
    lambda content: ast.literal_eval(content) if isinstance(content, str) else content
)
# One row per chunk; the remaining columns are repeated for each chunk of a document.
df_exploded_contents = df.explode('content')

In [None]:
# NOTE(review): duplicated() without `subset` compares every column of the
# exploded rows, not only 'content' — confirm this is the intended definition
# of a duplicated chunk before relying on the difference below.
total_duplicates = df_exploded_contents.duplicated().sum()
title_duplicates = duplicated_title_contents["duplicated_count"].sum()

print(f'Total duplicated contents: {total_duplicates}'
      f'\nTotal duplicated contents from duplicated titles: {title_duplicates}'
      f'\nTotal duplicated contents from non-duplicated titles: {total_duplicates - title_duplicates}')

As the result shows, another `629` duplicate chunks emerged on top of the `264` duplicates found among the duplicate-title occurrences.

### Analyzing Languages

In [None]:
from langdetect import DetectorFactory, LangDetectException, detect
import numpy as np

# langdetect is non-deterministic by default; fixing the seed makes the detected
# languages reproducible across kernel restarts and re-runs.
DetectorFactory.seed = 0


def safe_detect(text):
    """Return the language code langdetect assigns to ``text``, or NaN.

    Parameters
    ----------
    text : object
        A list or tuple of chunks is joined with single spaces, so the
        detector sees the plain text rather than the Python repr of the list
        (brackets, quotes, escape sequences). Any other value is converted
        with ``str``.

    Returns
    -------
    str or float
        The code returned by ``langdetect.detect``, or ``np.nan`` when
        langdetect raises ``LangDetectException``.
    """
    if isinstance(text, (list, tuple)):
        text = ' '.join(map(str, text))
    else:
        text = str(text)
    try:
        return detect(text)
    except LangDetectException:
        return np.nan


# Detect one language per document from its full content.
df['language'] = df['content'].apply(safe_detect)

In [None]:
# Keep NaN in the tally: safe_detect returns NaN for content it could not
# classify, and value_counts() would otherwise drop those rows silently.
df['language'].value_counts(dropna=False)

In [None]:
# Preview the frame again now that the 'language' column has been added.
df.head()

In [None]:
# Ids (column 'Unnamed: 0') of the documents whose detected language is 'ru' or 'de'.
# The variable name is kept as-is because the following cell iterates over it.
is_ru_or_de = df['language'].isin(['ru', 'de'])
lang_anomlaies = df.loc[is_ru_or_de, 'Unnamed: 0'].to_list()
lang_anomlaies

In [None]:
# For each flagged document, record (document id, chunk position) for every
# chunk that is itself detected as non-English.
non_english_chunks = []

for lang_anomaly in lang_anomlaies:
    # First row carrying this id; its 'content' cell is the list of chunks.
    chunks = df.loc[df['Unnamed: 0'] == lang_anomaly, 'content'].iloc[0]
    for chunk_position, chunk_text in enumerate(chunks):
        # safe_detect returns NaN instead of raising LangDetectException when a
        # chunk cannot be classified; such chunks are skipped rather than
        # aborting the whole scan.
        chunk_language = safe_detect(chunk_text)
        if isinstance(chunk_language, str) and chunk_language != 'en':
            non_english_chunks.append((lang_anomaly, chunk_position))

non_english_chunks

In [None]:
from src.preprocessing import Preprocessor

# Hand the frame (content parsed into lists, 'language' column added) to the
# project's Preprocessor.
# NOTE(review): whether Preprocessor copies `df` or mutates it is not visible here — confirm.
pp = Preprocessor(dataframe=df, verbose=True)

In [None]:
# Run the preprocessing; the steps are defined in src.preprocessing and are not shown in this notebook.
pp.preprocess()

In [None]:
# Preview the frame held by the preprocessor after preprocess() has run.
pp.df.head()