In [2]:
import pandas as pd
import re
import string
import numpy as np

from src.nnmf_pipeline import NNMFPipelineEnglish
from src.lsa_pipeline import LSAPipelineEnglish
# Adjust the dataset path below to match your environment.
df = pd.read_csv('data/processed/en_tweets_processed.csv')
# Keep rows whose language is 'en' and whose text is not missing.
df = df.loc[(df['language'] == 'en') & df['text'].notna()]
preprocessed_documents = df['text'].tolist()


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Selecting the number of topics (`n_components`)

In [4]:

# Candidate topic counts; one coherence score is collected per candidate, in order.
n_components_range = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 70, 100]
scores = []
for n in n_components_range:
    NNMFPipeline = NNMFPipelineEnglish(
        preprocessed_documents,
        ngram_range=(1, 2),
        n_components=n,
        max_iter=1000,
        tf_idf_max_df=0.9,
        tf_idf_min_df=4,
        random_state=0,
    )
    # Topic detection runs before the coherence score is requested.
    topics1 = NNMFPipeline.run_topics_detection()
    current_score = NNMFPipeline.calculate_coherence_score()
    print(f'Current score {current_score}, n_components = {n}')
    scores.append(current_score)

Current score 0.7048616049254075, n_components = 5
Current score 0.7546359202031336, n_components = 10
Current score 0.7169844864297766, n_components = 15
Current score 0.6905947006293516, n_components = 20
Current score 0.668590728970941, n_components = 25
Current score 0.6368263803499924, n_components = 30
Current score 0.6401762003585644, n_components = 35
Current score 0.6550259907441649, n_components = 40
Current score 0.6250037375938021, n_components = 45
Current score 0.6199270528298833, n_components = 50
Current score 0.599805520260647, n_components = 70
Current score 0.5922649824398365, n_components = 100


In [13]:
# The position of the highest coherence score selects the matching topic count.
best_score_index = np.argmax(scores)
best_n_components = n_components_range[best_score_index]

In [None]:
# Build the NMF pipeline with 5 components, detect topics, then display the coherence score.
NNMFPipeline = NNMFPipelineEnglish(
    preprocessed_documents,
    ngram_range=(1, 2),
    n_components=5,
    max_iter=1000,
    tf_idf_max_df=0.9,
    tf_idf_min_df=4,
    random_state=0,
)
topics1 = NNMFPipeline.run_topics_detection()
NNMFPipeline.calculate_coherence_score()

0.7048616049254075

In [None]:
# Build the LSA pipeline with 5 components, detect topics, then display the coherence score.
lsa_pipeline = LSAPipelineEnglish(
    preprocessed_documents,
    ngram_range=(1, 2),
    lsa_components=5,
    svd_n_iter=200,
    tf_idf_max_df=0.9,
    tf_idf_min_df=4,
    random_state=0,
)
topics2 = lsa_pipeline.run_topics_detection()
lsa_pipeline.calculate_coherence_score()

0.7562434732130454

## Topic Modeling per month (not all data used)

In [2]:
import os 
import pandas as pd
from src.nnmf_pipeline import NNMFPipelineEnglish

# List the CSV files in the monthly data directory. Match on the full '.csv'
# extension so that names merely ending in the letters "csv" are not picked up.
csv_files = [name for name in os.listdir('data/montly_data') if name.endswith('.csv')]

In [3]:
import pandas as pd
import gc  # Garbage collection
from pathlib import Path

# Assuming `NNMFPipelineEnglish` is defined elsewhere and properly imported

def read_and_process_csv(file_path, usecols=['text'], subset='text'):
    """Read one column of a CSV file, de-duplicated and without missing values.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.
    usecols : list-like, default ``['text']``
        Columns to load. Must include ``subset``. The default list is never
        mutated by this function.
    subset : str, default ``'text'``
        Column used for de-duplication and missing-value removal, and the
        column that is returned.

    Returns
    -------
    pandas.Series
        The ``subset`` column with duplicate and missing entries removed.

    Notes
    -----
    If the first parse fails with ``pd.errors.ParserError``, the file is read
    again with ``lineterminator='\\n'``. Any other exception propagates.
    """
    try:
        df = pd.read_csv(file_path, usecols=usecols)
    except pd.errors.ParserError:
        # Retry with an explicit line terminator when the default parse fails.
        df = pd.read_csv(file_path, lineterminator='\n', usecols=usecols)
    # Same order as before: drop duplicates first, then rows with a missing value.
    df = df.drop_duplicates(subset=subset).dropna(subset=[subset])
    # Return the requested column rather than a hard-coded 'text' column.
    return df[subset]

def extract_topics(documents):
    """Run the English NMF pipeline on ``documents``.

    Returns a ``(topics, score)`` tuple: the result of ``run_topics_detection()``
    followed by the result of ``calculate_coherence_score()``.
    """
    nmf = NNMFPipelineEnglish(
        documents,
        ngram_range=(1, 2),
        n_components=10,
        max_iter=1000,
        tf_idf_max_df=0.9,
        tf_idf_min_df=4,
        random_state=0,
    )
    # Topic detection runs before the coherence score is requested.
    detected_topics = nmf.run_topics_detection()
    coherence = nmf.calculate_coherence_score()
    return detected_topics, coherence

# Main processing loop
# Run topic extraction for every CSV file in the monthly data directory.
topics_per_file = {}
data_path = Path('data/montly_data')

for csv_file in data_path.glob('*.csv'):
    documents = read_and_process_csv(csv_file)
    has_documents = not documents.empty
    if has_documents:
        topics, score = extract_topics(documents)
        # The first two underscore-separated parts of the file stem form the key.
        stem_parts = csv_file.stem.split('_')
        date = '_'.join(stem_parts[:2])
        print(date, score)
        topics_per_file[date] = dict(cv_score=score, topics=topics)

    # Drop the reference and trigger a collection pass before the next file is read.
    del documents
    gc.collect()


  df = pd.read_csv(file_path, usecols=usecols).drop_duplicates(subset=subset)


2023_04 0.8345838584199361


  df = pd.read_csv(file_path, usecols=usecols).drop_duplicates(subset=subset)


2022_08 0.7097407808017794


  df = pd.read_csv(file_path, usecols=usecols).drop_duplicates(subset=subset)


2023_01 0.7420393781816056


  df = pd.read_csv(file_path, usecols=usecols).drop_duplicates(subset=subset)


2023_02 0.7592571760375599
2022_12 0.8683631982102987
2022_11 0.8020437425581433
2022_09 0.747827180109221


  df = pd.read_csv(file_path, usecols=usecols).drop_duplicates(subset=subset)


2023_03 0.776088679856147
2023_05 0.8019473179719212
2022_10 0.7316282690297533


In [5]:
import json

# Serialize the per-file results to an indented JSON document on disk.
file_path = 'topics_per_file.json'
with open(file_path, 'w') as json_file:
    serialized = json.dumps(topics_per_file, indent=4)
    json_file.write(serialized)

In [11]:
# Display the stored 'cv_score' and 'topics' entry for the '2022_08' key.
topics_per_file['2022_08']

{'cv_score': 0.7097407808017794,
 'topics': [['ukraine',
   'glory',
   'support',
   'support ukraine',
   'glory ukraine',
   'day ukraine',
   'stand',
   'ukraine independence',
   'russia',
   'help'],
  ['morning',
   'good',
   'good morning',
   'twice',
   'coming',
   'twice coming',
   'love',
   'night',
   'good night',
   'day'],
  ['russia',
   'putin',
   'state',
   'russia defeated',
   'possible russia',
   'pariah state',
   'peace possible',
   'russia pariah',
   'russian barbarism',
   'barbarism limit'],
  ['thank',
   'support',
   'thank support',
   'need',
   'support need',
   'ukraine thank',
   'need 20',
   'freedom',
   'thank fighting',
   'independence freedom'],
  ['day',
   'independence',
   'independence day',
   'happy',
   'happy independence',
   'day ukraine',
   'ukraine independence',
   'today',
   'ukrainian',
   'ukrainian independence'],
  ['analysis',
   'im',
   'bot',
   'im bot',
   'analysis article',
   'content analysis',
   'arti

In [None]:
# Read the processed tweets CSV and display the resulting DataFrame
# (the result is not assigned to a variable).
pd.read_csv('data/processed/en_tweets_processed.csv')

In [None]:
# NOTE(review): bare float literal with no assignment; it looks like a pasted
# coherence score — confirm whether this cell is still needed.
0.7048616049254076