In [11]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Sentiment Analysis
from textblob import TextBlob

# NLP
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Word Cloud
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Machine Learning
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.linear_model import LogisticRegression

# Other
import os
import re
import datetime
import string
import warnings
warnings.filterwarnings("ignore")

In [43]:
#  Importing the dataset
dataset = pd.read_csv('tweets.csv', encoding='utf-8', header='infer')
# Print the first 5 rows of the dataframe
print(dataset.head())

# Print the shape of the dataframe
print(dataset.shape)
# Print information about the dataframe
print(dataset.info())


                        date  \
0  2023-02-21 03:30:04+00:00   
1  2023-02-21 03:29:07+00:00   
2  2023-02-21 03:29:04+00:00   
3  2023-02-21 03:28:06+00:00   
4  2023-02-21 03:27:38+00:00   

                                             content  \
0  तुर्की में सोमवार देर रात भूंकप के तेज झटके मह...   
1  New search &amp; rescue work is in progress in...   
2  Can't imagine those who still haven't recovere...   
3  its a highkey sign for all of us to ponder ove...   
4  Turkiye Earthquake: तुर्किए में फिर आया भूकंप ...   

                                            hashtags  like_count  rt_count  \
0  ['ATDigital', 'Turkey', 'Earthquake', 'TurkeyE...         0.0       0.0   
1  ['Hatay', 'earthquakes', 'Türkiye', 'TurkiyeQu...         1.0       0.0   
2  ['Turkey', 'earthquake', 'turkeyearthquake2023...         0.0       0.0   
3    ['turkeyearthquake2023', 'earthquake', 'Syria']         0.0       0.0   
4  ['turkey', 'earthquake', 'turkiye', 'india', '...         0.0       0.0   

 

In [44]:
dataset_en = dataset[dataset['language'] == 'en'].copy()

# Print the shape of the dataframe with only english tweets (189626, 11)
print(dataset_en.shape)

# Transform the date column to datetime format
dataset_en['date'] = pd.to_datetime(dataset_en['date'])

# Transform the format of the date column
dataset_en['date'] = dataset_en['date'].dt.strftime('%Y-%m-%d %H:%M:%S')

# Print the first 5 rows of the dataframe with only english tweets
print(dataset_en.head())

(189626, 11)
                  date                                            content  \
1  2023-02-21 03:29:07  New search &amp; rescue work is in progress in...   
2  2023-02-21 03:29:04  Can't imagine those who still haven't recovere...   
3  2023-02-21 03:28:06  its a highkey sign for all of us to ponder ove...   
5  2023-02-21 03:27:27  See how strong was the #Earthquake of Feb 20, ...   
6  2023-02-21 03:27:11  More difficult news today on top of struggles ...   

                                            hashtags  like_count  rt_count  \
1  ['Hatay', 'earthquakes', 'Türkiye', 'TurkiyeQu...         1.0       0.0   
2  ['Turkey', 'earthquake', 'turkeyearthquake2023...         0.0       0.0   
3    ['turkeyearthquake2023', 'earthquake', 'Syria']         0.0       0.0   
5  ['Earthquake', 'Hatay', 'Turkey', 'turkeyearth...         0.0       0.0   
6    ['Türkiye', 'Syria', 'earthquake', 'Canadians']         1.0       0.0   

   followers_count isVerified language coordinates plac

In [38]:
print(dataset_en['content'][1])

New search &amp; rescue work is in progress in #Hatay after two more #earthquakes hit #Türkiye’s southeastern province.  #TurkiyeQuakes #Turkey-#Syria  #Earthquake #turkeyearthquake2023  https://t.co/sd4WHByiQs


In [45]:
###########################################################################################################################################################
# CONTENT CLEANING
###########################################################################################################################################################

import re
import string

# FIRST STEP: LINK REMOVAL
def remove_links(text):
    text = re.sub(f"http\S+", "", text) 
    return text

# SECOND STEP: PUNCTUATION REMOVAL
def remove_punctuation(text):
    text = re.sub(f"[^\w\d\s]+", "", text)
    return text

# THIRD STEP: LOWERCASE
def lowercase(text):
    text = text.lower()
    return text

# FOURTH STEP: REMOVE STOPWORDS
def remove_stopwords(text):
    text = text.split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops]
    text = " ".join(text)
    return text

# FIFTH STEP: LEMMATIZATION
def lemmatize(text):
    text = text.split()
    lemmatizer = WordNetLemmatizer()
    text = [lemmatizer.lemmatize(word) for word in text]
    text = " ".join(text)
    return text


# Apply the function to the content column
dataset_en['content'] = dataset_en['content'].apply(lambda x: remove_links(x))
dataset_en['content'] = dataset_en['content'].apply(lambda x: remove_punctuation(x))
dataset_en['content'] = dataset_en['content'].apply(lambda x: lowercase(x))
dataset_en['content'] = dataset_en['content'].apply(lambda x: remove_stopwords(x))
dataset_en['content'] = dataset_en['content'].apply(lambda x: lemmatize(x))

In [47]:
# Print the first 5 rows of the dataframe with only english tweets
print(dataset_en['content'][1])

new search amp rescue work progress hatay two earthquake hit türkiyes southeastern province turkiyequakes turkeysyria earthquake turkeyearthquake2023


In [None]:
# %pip install -U nltk
# %pip install -U spacy

python -m spacy download en_core_web_lg

In [53]:
# SPACY NLP TAGGER
import spacy

# Load the large english model
nlp = spacy.load('en_core_web_lg')

# Create a function to get the pos tag of each word
def get_pos_tag(text):
    doc = nlp(text)
    pos_tagged = []
    for token in doc:
        pos_tagged.append((token.text, token.pos_))
    return pos_tagged

# Apply the function to the content column
dataset_en['content'] = dataset_en['content'].apply(lambda x: get_pos_tag(x))


# Create a function to get the wordnet pos tag of each word
# def get_wordnet_pos(pos_tag):
#     if pos_tag.startswith('J'):
#         return wordnet.ADJ
#     elif pos_tag.startswith('V'):
#         return wordnet.VERB
#     elif pos_tag.startswith('N'):
#         return wordnet.NOUN
#     elif pos_tag.startswith('R'):
#         return wordnet.ADV
#     else:
#         return wordnet.NOUN # We will use NOUN as default


In [54]:
# Print the result
print(dataset_en['content'][1])

[('new', 'ADJ'), ('search', 'NOUN'), ('amp', 'NOUN'), ('rescue', 'NOUN'), ('work', 'NOUN'), ('progress', 'NOUN'), ('hatay', 'PROPN'), ('two', 'NUM'), ('earthquake', 'NOUN'), ('hit', 'VERB'), ('türkiyes', 'PROPN'), ('southeastern', 'PROPN'), ('province', 'PROPN'), ('turkiyequakes', 'PROPN'), ('turkeysyria', 'PROPN'), ('earthquake', 'PROPN'), ('turkeyearthquake2023', 'PROPN')]


In [None]:
# Stanford NER 
java_path = 'C:\\Program Files (x86)\\Java\\jre1.8.0_361\\bin\\java.exe'
os.environ['JAVAHOME'] = java_path

from nltk.tag.stanford import StanfordNERTagger

# Specify the path to the jar file and the model file
model_path = 'stanford-ner.jar'
ner_path ='english.conll.4class.distsim.crf.ser.gz'

# Create a tagger object
tagger = StanfordNERTagger(ner_path, model_path, encoding='utf-8')

# Tag named entities in all tweets
tagged_tweets = [tagger.tag(tweet.split()) for tweet in dataset_en['content']]

# Print the first 5 rows of the tagged tweets
print(tagged_tweets[:5])

In [22]:
# Barplot of the tweets per day
plt.figure(figsize=(10, 5))

sns.countplot(x='date', data=dataset_en, palette='hls')
plt.xticks(rotation=90)
plt.xlabel('Date', fontsize=15)
plt.ylabel('Number of tweets', fontsize=15)
plt.title('Tweets per day', fontsize=20)
plt.show()

# Distribution of the tweets per hour
plt.figure(figsize=(10, 5))

sns.countplot(x=dataset_en['date'].str[11:13], data=dataset_en, palette='hls')
plt.xlabel('Hour', fontsize=15)
plt.ylabel('Number of tweets', fontsize=15)
plt.title('Tweets per hour', fontsize=20)
plt.show()


KeyboardInterrupt: 

In [None]:
# PREPROCESSING TO-DO LIST: 
#
# 1. Change like_count to int
# 2. Change retweet_count to int
# 3. Change date format
# 4. Move time into a new column
# 5. Translate the text to English
# 6. Remove the rows with no text
# 7. Remove the rows with no hashtags
# 8. Remove the rows with no mentions
# 9. Remove the rows with no links
# 10. Remove the rows with no media
# 11. Remove the rows with no retweet_count

# CONSIDERATIONS:
# 1. Retweets are important to consider because they are a form of engagement (do not consider tweets with no retweets, but consider retweets as a further request for help)
