In [1]:
import pandas as pd
import plotly.graph_objects as go
import ast

DATA_FOLDER = './data/Cleantech Media Dataset'

In [2]:
df = pd.read_csv(f'{DATA_FOLDER}/cleantech_media_dataset_v2_2024-02-23.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,date,author,content,domain,url
0,1280,Qatar to Slash Emissions as LNG Expansion Adva...,2021-01-13,,"[""Qatar Petroleum ( QP) is targeting aggressiv...",energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...
1,1281,India Launches Its First 700 MW PHWR,2021-01-15,,"[""• Nuclear Power Corp. of India Ltd. ( NPCIL)...",energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...
2,1283,New Chapter for US-China Energy Trade,2021-01-20,,"[""New US President Joe Biden took office this ...",energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...
3,1284,Japan: Slow Restarts Cast Doubt on 2030 Energy...,2021-01-22,,"[""The slow pace of Japanese reactor restarts c...",energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...
4,1285,NYC Pension Funds to Divest Fossil Fuel Shares,2021-01-25,,"[""Two of New York City's largest pension funds...",energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...


In [4]:
columns = df.columns
total_counts = df.count()
nan_counts = df.isna().sum()
unique_counts = df.nunique()

nan_percentages = (nan_counts / len(df)) * 100
unique_percentages = (unique_counts / len(df)) * 100

data = {
    'Total Count': total_counts,
    'NaN Count': nan_counts,
    'NaN Percentage (%)': nan_percentages,
    'Unique Count': unique_counts,
    'Unique Percentage (%)': unique_percentages
}
summary_df = pd.DataFrame(data, index=columns)

summary_df

Unnamed: 0,Total Count,NaN Count,NaN Percentage (%),Unique Count,Unique Percentage (%)
Unnamed: 0,9593,0,0.0,9593,100.0
title,9593,0,0.0,9569,99.749818
date,9593,0,0.0,967,10.080267
author,31,9562,99.676848,7,0.07297
content,9593,0,0.0,9588,99.947879
domain,9593,0,0.0,19,0.198061
url,9593,0,0.0,9593,100.0


In [5]:
domain_freq = df['domain'].value_counts()
domain_freq = domain_freq.reset_index()
domain_freq.columns = ['domain', 'count']

fig = go.Figure()
fig.add_trace(go.Bar(x=domain_freq['domain'], y=domain_freq['count']))

fig.update_layout(
    title='Frequency of Publishers in Cleantech',
    xaxis_title='Domain',
    yaxis_title='Frequency'
)

fig.show()

### Taking a closer look at titles
As the summary has shown, only `9569` of the `9593` scraped resources in the dataset have a unique title. This subsection explores if these "duplicate titles" have an underlying error or if these occurences of duplicates can be ignored.

In [6]:
title_freq = df['title'].value_counts()
title_freq = title_freq[title_freq > 1]
title_freq = title_freq.reset_index()
title_freq.columns = ['title', 'count']

title_freq

Unnamed: 0,title,count
0,Cleantech Thought Leaders Series,5
1,About David J. Cross,5
2,Cleantech Insights from Industry Series,4
3,"Truss, Johnson Join Rebellion Against Sunak fo...",2
4,Staggering potential in next-gen building bloc...,2
5,Royal Opera House drops BP as sponsor after 33...,2
6,Macquarie targets North Sea as the green energ...,2
7,BEIS mulls ringfenced CfD support for geotherm...,2
8,Five ways for the energy industry to shift the...,2
9,Key trends in UK renewables and what to expect...,2


Now, lets take a closer look at the contents of the suspected duplicate documents.

In [7]:
def calculate_all_duplicate_document_contents(df, title_freq):
    duplicates_counts = {}
    
    for title in title_freq['title']:
        duplicate_contents = df[df['title'] == title]['content']
        duplicate_contents = duplicate_contents.apply(ast.literal_eval)
        duplicate_contents = duplicate_contents.explode()
        duplicates_count = duplicate_contents.duplicated().sum()
        duplicates_counts[title] = duplicates_count
    
    # return a pandas dataframe with the title and the count of duplicated contents
    return pd.DataFrame(list(duplicates_counts.items()), columns=['title', 'duplicated_count'])

duplicated_title_contents = calculate_all_duplicate_document_contents(df, title_freq)

duplicated_title_contents

Unnamed: 0,title,duplicated_count
0,Cleantech Thought Leaders Series,30
1,About David J. Cross,48
2,Cleantech Insights from Industry Series,21
3,"Truss, Johnson Join Rebellion Against Sunak fo...",9
4,Staggering potential in next-gen building bloc...,39
5,Royal Opera House drops BP as sponsor after 33...,10
6,Macquarie targets North Sea as the green energ...,12
7,BEIS mulls ringfenced CfD support for geotherm...,17
8,Five ways for the energy industry to shift the...,15
9,Key trends in UK renewables and what to expect...,10


The function yielded that the duplicate observations of title contain actual duplicate information on chunk-basis.

This could mean that there are even more duplicate chunks under titles that aren't duplicate, so lets next look at that:

In [8]:
df['content'] = df['content'].apply(ast.literal_eval)
df_exploded_contents = df.explode('content')

In [9]:
print(f'Total duplicated contents: {df_exploded_contents.duplicated().sum()}'
      f'\nTotal duplicated contents from duplicated titles: {duplicated_title_contents["duplicated_count"].sum()}'
      f'\nTotal duplicated contents from non-duplicated titles: {df_exploded_contents.duplicated().sum()-duplicated_title_contents["duplicated_count"].sum()}')

Total duplicated contents: 893
Total duplicated contents from duplicated titles: 264
Total duplicated contents from non-duplicated titles: 629


As the result shows, another `629` chunks on top of the `264` duplicates inside the duplicate-title-occurences emerged.

### Analyzing Languages

In [13]:
df

Unnamed: 0.1,Unnamed: 0,title,date,author,content,domain,url
0,1280,Qatar to Slash Emissions as LNG Expansion Adva...,2021-01-13,,[Qatar Petroleum ( QP) is targeting aggressive...,energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...
1,1281,India Launches Its First 700 MW PHWR,2021-01-15,,[• Nuclear Power Corp. of India Ltd. ( NPCIL) ...,energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...
2,1283,New Chapter for US-China Energy Trade,2021-01-20,,[New US President Joe Biden took office this w...,energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...
3,1284,Japan: Slow Restarts Cast Doubt on 2030 Energy...,2021-01-22,,[The slow pace of Japanese reactor restarts co...,energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...
4,1285,NYC Pension Funds to Divest Fossil Fuel Shares,2021-01-25,,[Two of New York City's largest pension funds ...,energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...
...,...,...,...,...,...,...,...
9588,81812,Strata Clean Energy Nets $ 300 Million in Fund...,2023-11-06,,[Strata Clean Energy has closed a $ 300 millio...,solarindustrymag,https://solarindustrymag.com/strata-clean-ener...
9589,81813,Orsted Deploying SparkCognition Renewable Suit...,2023-11-07,,[Global renewable energy developer Ørsted is d...,solarindustrymag,https://solarindustrymag.com/orsted-deploying-...
9590,81814,Veolia Has Plans for 5 MW of Solar in Arkansas,2023-11-07,,"[Veolia North America, a provider of environme...",solarindustrymag,https://solarindustrymag.com/veolia-has-plans-...
9591,81815,"SunEdison: Too Big, Too Fast?",2023-11-08,,[Once the self-proclaimed “ leading renewable ...,solarindustrymag,http://www.solarindustrymag.com/online/issues/...


In [14]:
from langdetect import detect, LangDetectException
import numpy as np

def safe_detect(text):
    text = str(text)
    try:
        return detect(text)
    except LangDetectException:
        return np.nan

df['language'] = df['content'].apply(safe_detect)

In [17]:
df['language'].value_counts()

language
en    9589
de       3
ru       1
Name: count, dtype: int64