In [1]:
import re
import pandas as pd

merged_df = pd.read_csv("merged_df.csv")
# Function to clean text
def clean_text(text):
    # Remove links
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove @ mentions
    text = re.sub(r'@\w+', '', text)
    # Remove hashtag symbols (but keep the text following it)
    text = re.sub(r'#', '', text)
    # Remove emojis
    text = re.sub(r'[^\w\s,]', '', text)
    return text

# Apply the function to the 'text' column
merged_df['text'] = merged_df['text'].apply(clean_text)


In [2]:
merged_df

Unnamed: 0.1,Unnamed: 0,text
0,0,RT UPDATE Death toll from Irans quake rises t...
1,1,RT We pray for all those affected by the eart...
2,2,RT JUST IN Death toll reaches 328 in Iran ear...
3,3,RT A magnitude 72 earthquake struck northern ...
4,7,Video 72magnitude earthquake jolts IranIraq bo...
...,...,...
36476,38749,Massive thanks to you and the entire team fo...
36477,38750,"PLEASE donate to AHBAP charity, the only chari..."
36478,38751,"I thought my heart would burst from grief, tea..."
36479,38752,Alhamdulilllah they are safe and sound ...


In [5]:
merged_df.to_csv('clean_df.csv')

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import re

# Assuming you have a DataFrame with a column 'text' containing the tweets
tweets_df = pd.DataFrame({'text': merged_df['text']})  # Replace merged_df with your actual DataFrame

# Preprocess the tweets by removing links, mentions, and hashtags
def preprocess_text(text):
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"@\S+", "", text)  # Remove @ mentions
    text = re.sub(r"#", "", text)  # Remove hashtag symbol
    return text

tweets_df['text'] = tweets_df['text'].apply(preprocess_text)

# Vectorize the text to find the most common words
vectorizer = CountVectorizer(stop_words='english', max_features=1000)  # Adjust features as needed
X = vectorizer.fit_transform(tweets_df['text'])

# Get the most common keywords
keywords = vectorizer.get_feature_names_out()
print("Top keywords:", keywords)


Top keywords: ['000' '10' '100' '11' '12' '13' '14' '15' '16' '17' '18' '19' '20' '200'
 '2023' '21' '22' '23' '24' '25' '26' '27' '28' '29' '30' '300' '35' '36'
 '37' '3rd' '40' '400' '45' '48' '50' '500' '5000' '54' '60' '600' '700'
 '72' '75' '76' '77' '78' '78magnitude' '800' 'able' 'absolutely' 'accept'
 'access' 'according' 'account' 'accounts' 'act' 'action' 'adana'
 'address' 'addresses' 'adele' 'adiyaman' 'aerial' 'afad' 'affected'
 'aftermath' 'aftershock' 'aftershocks' 'agency' 'ago' 'ahbap' 'aid' 'air'
 'aircraft' 'airport' 'airspace' 'alert' 'alive' 'allah' 'almighty'
 'amazing' 'ambassador' 'ameen' 'amen' 'amid' 'amin' 'amp' 'animals'
 'ankara' 'announced' 'antakya' 'aoty' 'apartment' 'appeal' 'appreciate'
 'appreciated' 'area' 'areas' 'army' 'arrive' 'arrived' 'ask' 'asked'
 'asking' 'assist' 'assistance' 'atsu' 'attention' 'authorities'
 'available' 'awareness' 'away' 'babies' 'baby' 'bad' 'bank' 'batch'
 'beautiful' 'behavior' 'believe' 'best' 'better' 'beyoncé' 'big'


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Use TfidfVectorizer to extract keywords with the highest tf-idf scores
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
tfidf_matrix = tfidf_vectorizer.fit_transform(tweets_df['text'])

# Get keywords and their scores
keywords = tfidf_vectorizer.get_feature_names_out()
print("Top TF-IDF keywords:", keywords)


Top TF-IDF keywords: ['000' '10' '100' '11' '12' '13' '14' '15' '16' '17' '18' '19' '20' '200'
 '2023' '21' '22' '23' '24' '25' '26' '27' '28' '29' '30' '300' '35' '36'
 '37' '3rd' '40' '400' '45' '48' '50' '500' '5000' '54' '60' '600' '700'
 '72' '75' '76' '77' '78' '78magnitude' '800' 'able' 'absolutely' 'accept'
 'access' 'according' 'account' 'accounts' 'act' 'action' 'adana'
 'address' 'addresses' 'adele' 'adiyaman' 'aerial' 'afad' 'affected'
 'aftermath' 'aftershock' 'aftershocks' 'agency' 'ago' 'ahbap' 'aid' 'air'
 'aircraft' 'airport' 'airspace' 'alert' 'alive' 'allah' 'almighty'
 'amazing' 'ambassador' 'ameen' 'amen' 'amid' 'amin' 'amp' 'animals'
 'ankara' 'announced' 'antakya' 'aoty' 'apartment' 'appeal' 'appreciate'
 'appreciated' 'area' 'areas' 'army' 'arrive' 'arrived' 'ask' 'asked'
 'asking' 'assist' 'assistance' 'atsu' 'attention' 'authorities'
 'available' 'awareness' 'away' 'babies' 'baby' 'bad' 'bank' 'batch'
 'beautiful' 'behavior' 'believe' 'best' 'better' 'beyoncé'