In [30]:
import pandas as pd
import re
import string

# Path to the tweet CSV, relative to the notebook's working directory;
# change it to point at your local copy of the dataset.
df = pd.read_csv('0819_UkraineCombinedTweetsDeduped.csv')

In [31]:
# Keep only the rows whose `language` column is 'en', then pull the raw
# tweet text out into a plain Python list for the preprocessing step.
is_english = df['language'] == 'en'
df = df[is_english]
documents = df['text'].tolist()


In [32]:
def remove_links_content(text):
    """Return ``text`` with every ``http``-prefixed run of non-whitespace removed.

    This covers both ``http://`` and ``https://`` URLs. Whitespace around a
    removed link is left untouched, so a link in the middle of a sentence
    leaves two adjacent spaces behind.
    """
    link_pattern = re.compile(r"http\S+")
    return link_pattern.sub("", text)


def remove_emails(text):
    """Remove every whitespace-delimited token that contains an ``@``.

    The pattern matches any run of non-whitespace characters around an ``@``
    plus at most one trailing whitespace character, so besides e-mail
    addresses it also strips Twitter-style ``@mentions``.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all ``@``-containing tokens deleted.
    """
    # Raw string: "\S" and "\s" are regex escapes, not Python string escapes.
    # A non-raw literal triggers an invalid-escape-sequence warning.
    return re.sub(r'\S*@\S*\s?', '', text)


def remove_punctuation(text):
    """Delete every ASCII punctuation character (``string.punctuation``) from ``text``.

    Characters are removed, not replaced by a space, so ``"war-torn"``
    becomes ``"wartorn"``. Approach taken from
    https://stackoverflow.com/a/37221663
    """
    # The third argument of str.maketrans lists the characters to delete.
    drop_punctuation = str.maketrans("", "", string.punctuation)
    return text.translate(drop_punctuation)

def remove_multiple_space(text):
    """Collapse each run of two or more whitespace characters into one space.

    A single whitespace character (including a lone tab or newline) is left
    as it is; only runs of length >= 2 are replaced.

    Args:
        text: The string to normalise.

    Returns:
        ``text`` with every whitespace run of length >= 2 replaced by ``" "``.
    """
    # Raw string so "\s" is read as a regex class rather than an (invalid)
    # Python string escape.
    return re.sub(r"\s\s+", " ", text)

In [39]:
!pip install cleantext

Collecting cleantext
  Downloading cleantext-1.1.4-py3-none-any.whl (4.9 kB)
Installing collected packages: cleantext
Successfully installed cleantext-1.1.4

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [40]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from cleantext import clean

# Fetch the NLTK resources this notebook relies on (tokenizer model,
# stop-word lists, WordNet); already-present packages are reported as
# up-to-date.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Shared by preprocess(): a lemmatizer instance and the English stop words
# held in a set for fast membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function to preprocess documents
def preprocess(document):
    """Turn one raw tweet into a space-joined string of lemmatized tokens.

    Pipeline, in order:
      1. strip links (``remove_links_content``),
      2. strip ``@``-containing tokens (``remove_emails``),
      3. strip punctuation (``remove_punctuation``),
      4. collapse whitespace runs (``remove_multiple_space``),
      5. lowercase and tokenize with NLTK ``word_tokenize``,
      6. keep only alphanumeric tokens that are not English stop words,
      7. lemmatize each remaining token with ``WordNetLemmatizer``.

    Args:
        document: Raw text. Any non-string value (for example ``None`` or the
            float ``NaN`` pandas uses for a missing cell) is treated as an
            empty document.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when nothing
        survives the filtering or the input is not a string.
    """
    # The regex helpers below raise TypeError on non-strings, so one missing
    # value in the text column would abort the whole corpus run.
    if not isinstance(document, str):
        return ''

    # Character-level clean-up before tokenizing
    document = remove_links_content(document)
    document = remove_emails(document)
    document = remove_punctuation(document)
    document = remove_multiple_space(document)

    # Tokenize
    words = word_tokenize(document.lower())
    # Remove stopwords and any token that is not purely alphanumeric
    filtered_words = [word for word in words if word.isalnum() and word not in stop_words]
    # Lemmatize
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
    return ' '.join(lemmatized_words)

# Run the cleaning pipeline over every tweet, keeping the input order
preprocessed_documents = list(map(preprocess, documents))


[nltk_data] Downloading package punkt to /home/eingrid/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/eingrid/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/eingrid/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [60]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from gensim import matutils

# Build the TF-IDF document-term matrix from the cleaned tweets.
# Terms found in more than 95% of the documents or in fewer than 4 documents
# are dropped, as are scikit-learn's built-in English stop words.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=4,
    max_df=0.95,
)
tfidf = tfidf_vectorizer.fit_transform(preprocessed_documents)

# Vocabulary terms, in the same order as the columns of `tfidf`
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()






In [61]:
# Factorise the TF-IDF matrix into 5 topics with NMF. Random initialisation
# with a fixed random_state keeps the starting point repeatable across runs.
model = NMF(n_components=5, init='random', random_state=0)
# W: the transformed data returned by fit_transform (one row per document)
W = model.fit_transform(tfidf)
# H: the fitted components, iterated below as one row per topic with one
# weight per TF-IDF feature
H = model.components_

In [62]:
# Extract the top words for each topic: for every row of H, take the
# n_top_words feature indices with the largest weights (heaviest first) and
# map them back to their vocabulary terms.
n_top_words = 10
topics = [
    [tfidf_feature_names[i] for i in topic_weights.argsort()[::-1][:n_top_words]]
    for topic_weights in H
]


In [63]:
# Tokenise every preprocessed document, keeping only the words that are part
# of the fitted TF-IDF vocabulary, so the coherence model sees the same terms
# the topic model was trained on.
# A set gives constant-time lookups; `word in <numpy array>` would rescan
# the whole vocabulary for every single token.
vocabulary = set(tfidf_feature_names)
texts = [[word for word in doc.lower().split() if word in vocabulary] for doc in preprocessed_documents]

# Create a Gensim dictionary (token <-> integer id mapping) from the tokenised texts
dictionary = Dictionary(texts)

# Bag-of-words form of each document. It is not passed to the CoherenceModel
# call below and is kept only in case later cells use it.
corpus = [dictionary.doc2bow(text) for text in texts]

# Calculate the c_v coherence of the extracted topics using Gensim
coherence_model = CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()

print('Coherence Score:', coherence_score)

Coherence Score: 0.719928770541528


In [64]:
# Display the top words extracted for each topic
topics

[['ukraine',
  'russia',
  'russian',
  'war',
  'ukrainian',
  'russiaisaterroriststate',
  'ukrainerussiawar',
  'nuclear',
  'plant',
  'military'],
 ['china',
  'putin',
  'biden',
  'russia',
  'india',
  'usa',
  'news',
  'world',
  'sweden',
  'uk'],
 ['armukrainenow',
  'germany',
  'missile',
  'save',
  'airdefence',
  'speed',
  'militaryaid',
  'defense',
  'transfer',
  'norway'],
 ['standwithukraine',
  'playing',
  'video',
  'rock',
  'thank',
  'ukrainerussiawar',
  'slavaukraini',
  'ukraineunderattack',
  'russianwarcrimes',
  'radio'],
 ['hire',
  'passive',
  'awesome',
  'term',
  'income',
  'website',
  'create',
  'autopilot',
  'long',
  'affiliate']]

In [59]:
# Count tweets per extraction date: keep the part of `extractedts` before the
# first space and tally the values. The vectorised .str accessor replaces a
# per-row Python lambda and passes missing values through instead of raising
# on them.
df['extractedts'].str.split(' ', n=1).str[0].value_counts()

extractedts
2022-08-19    17663
2022-08-20     5516
Name: count, dtype: int64