In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Sentiment Analysis
from textblob import TextBlob

# NLP
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Word Cloud
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Machine Learning
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.linear_model import LogisticRegression

# Other
import os
import re
import datetime
import string
import warnings
# Suppress every warning for the remainder of the session.
warnings.filterwarnings("ignore")

# Fetch the NLTK data sets this notebook relies on (the stop-word lists and
# WordNet), in the same order as before.
for _nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(_nltk_resource)

In [None]:
# Load the raw tweets from the local CSV export.
dataset = pd.read_csv('archive/tweets.csv', encoding='utf-8', header='infer')

# Preview the first five rows of the raw frame.
print(dataset.head())

# (rows, columns) of the raw frame.
print(dataset.shape)

# Column summary of the raw frame. DataFrame.info() writes its report to
# stdout itself and returns None, so it must not be wrapped in print():
# doing so appends a stray "None" line to the cell output.
dataset.info()


In [None]:
# Keep only the rows whose language code is 'en'. The .copy() makes the subset
# an independent frame, so the column assignment below does not touch `dataset`.
is_english = dataset['language'] == 'en'
dataset_en = dataset.loc[is_english].copy()

# Shape of the English-only frame (the original author recorded (189626, 11)).
print(dataset_en.shape)

# Parse the 'date' column, then store it back as a uniformly formatted
# 'YYYY-MM-DD HH:MM:SS' string (the column ends up as text, not datetime).
parsed_dates = pd.to_datetime(dataset_en['date'])
dataset_en['date'] = parsed_dates.dt.strftime('%Y-%m-%d %H:%M:%S')

# Preview the first five English tweets.
print(dataset_en.head())

In [None]:
# Show one raw tweet as a sanity check. dataset_en keeps the row labels of
# `dataset`, so the label 1 exists only if that particular row is English and
# a label lookup can raise KeyError. .iloc selects by position (the second
# English tweet) and does not depend on which labels survived the filter.
print(dataset_en['content'].iloc[1])

In [None]:
# The patterns are compiled once, as raw strings: in a normal (or f-) string
# "\S" and "\w" are invalid escape sequences, which newer Python versions
# report with a warning. The pattern text itself is unchanged.
_URL_PATTERN = re.compile(r"http\S+")
_PUNCTUATION_PATTERN = re.compile(r"[^\w\d\s]+")

# Filled on the first clean_text() call so that the stop-word list and the
# lemmatizer are built once instead of once per tweet.
_CLEAN_TEXT_RESOURCES = {}


def _get_clean_text_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use."""
    if not _CLEAN_TEXT_RESOURCES:
        _CLEAN_TEXT_RESOURCES["stops"] = frozenset(stopwords.words("english"))
        _CLEAN_TEXT_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return _CLEAN_TEXT_RESOURCES["stops"], _CLEAN_TEXT_RESOURCES["lemmatizer"]


def clean_text(text):
    """Normalise one tweet for downstream NLP.

    Steps, in order: drop every run starting with "http" up to the next
    whitespace, drop every character that is not a word character, digit or
    whitespace, lower-case, split on whitespace, remove English stop words,
    lemmatize each remaining word with WordNetLemmatizer, and re-join the
    words with single spaces.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text; an empty string if nothing is left.
    """
    text = _URL_PATTERN.sub("", text)  # Remove links
    text = _PUNCTUATION_PATTERN.sub("", text)  # Remove punctuation
    stops, lemmatizer = _get_clean_text_resources()
    words = [word for word in text.lower().split() if word not in stops]
    return " ".join(lemmatizer.lemmatize(word) for word in words)
# Clean every tweet in place. Missing values are replaced by an empty string
# *before* the cast: astype(str) alone turns NaN into the literal text "nan",
# which would then be processed as if it were tweet content.
dataset_en['content'] = dataset_en['content'].fillna('').astype(str)
dataset_en['content'] = dataset_en['content'].apply(clean_text)


In [None]:
# Print one cleaned tweet (the second English tweet, selected by position).
# dataset_en keeps the row labels of `dataset`, so a label lookup with [1]
# raises KeyError whenever row 1 of the raw data is not English.
print(dataset_en['content'].iloc[1])

In [None]:
# %pip install -U nltk
# %pip install -U spacy

!python -m spacy download en_core_web_lg

In [None]:
# SPACY NLP TAGGER
import spacy

# Load the large English spaCy pipeline once; get_pos_tag() reads this global.
nlp = spacy.load('en_core_web_lg')


def get_pos_tag(text):
    """Tag every token of `text` with its part of speech.

    The text is run through the module-level `nlp` pipeline.

    Parameters
    ----------
    text : str
        Text to analyse.

    Returns
    -------
    list of tuple
        One (token.text, token.pos_) pair per token, in document order.
    """
    return [(token.text, token.pos_) for token in nlp(text)]

# POS-tag every tweet of the content column.
# NOTE: the result is written back into 'content', so from here on that column
# holds lists of (token, POS) tuples instead of strings.
pos_tagged_content = dataset_en['content'].apply(get_pos_tag)
dataset_en['content'] = pos_tagged_content


# Create a function to get the wordnet pos tag of each word
# def get_wordnet_pos(pos_tag):
#     if pos_tag.startswith('J'):
#         return wordnet.ADJ
#     elif pos_tag.startswith('V'):
#         return wordnet.VERB
#     elif pos_tag.startswith('N'):
#         return wordnet.NOUN
#     elif pos_tag.startswith('R'):
#         return wordnet.ADV
#     else:
#         return wordnet.NOUN # We will use NOUN as default


In [None]:
# Print the result for one tweet (the second English tweet, by position).
# dataset_en keeps the row labels of `dataset`, so a label lookup with [1]
# raises KeyError whenever row 1 of the raw data is not English.
print(dataset_en['content'].iloc[1])

In [None]:
# Stanford NER
# The Java binary is located through the JAVAHOME environment variable.
# setdefault() keeps a value that is already configured on this machine and
# only falls back to the hard-coded path below (a local Windows install).
java_path = 'C:\\Program Files (x86)\\Java\\jre1.8.0_361\\bin\\java.exe'
os.environ.setdefault('JAVAHOME', java_path)

from nltk.tag.stanford import StanfordNERTagger

# Specify the path to the jar file and the model file.
# NOTE: the variable names are misleading -- `model_path` holds the jar and
# `ner_path` holds the CRF model. They are kept as they are in case other
# cells use them; the keyword arguments below make the mapping explicit.
model_path = 'stanford-ner.jar'
ner_path ='english.conll.4class.distsim.crf.ser.gz'

# Create a tagger object
tagger = StanfordNERTagger(model_filename=ner_path, path_to_jar=model_path, encoding='utf-8')


def _tweet_tokens(tweet):
    """Return the token list of a tweet.

    'content' holds plain strings after cleaning but lists of (token, POS)
    tuples once the POS-tagging cell has run; calling .split() on the latter
    raises AttributeError, so both representations are handled here.
    """
    if isinstance(tweet, str):
        return tweet.split()
    # Whitespace-only tokens are skipped: the tagger input is whitespace
    # separated, so they would not come back as tokens of their own.
    return [token for token, _pos in tweet if token.strip()]


# Tag named entities in all tweets. tag_sents() hands every tweet to a single
# Java invocation, whereas calling tag() per tweet starts one Java process for
# each tweet. The result has the same shape as before: one list of
# (token, entity label) pairs per tweet.
tweet_tokens = [_tweet_tokens(tweet) for tweet in dataset_en['content']]
tagged_tweets = tagger.tag_sents(tweet_tokens)

# Print the tagged tokens of the first 5 tweets
print(tagged_tweets[:5])

In [None]:
# Barplot of the tweets per day
# 'date' holds full 'YYYY-MM-DD HH:MM:SS' strings, so counting on the raw
# column would draw one bar per distinct second. Slice off the 'YYYY-MM-DD'
# prefix to bucket by calendar day, and order the bars chronologically
# (ISO dates sort correctly as text).
tweet_day = dataset_en['date'].str[:10]

plt.figure(figsize=(10, 5))

sns.countplot(x=tweet_day, order=sorted(tweet_day.dropna().unique()), palette='hls')
plt.xticks(rotation=90)
plt.xlabel('Date', fontsize=15)
plt.ylabel('Number of tweets', fontsize=15)
plt.title('Tweets per day', fontsize=20)
plt.show()

# Distribution of the tweets per hour
# Characters 11-12 of the timestamp string are the zero-padded hour ('00'-'23');
# sorting them as text therefore gives the natural hour order.
tweet_hour = dataset_en['date'].str[11:13]

plt.figure(figsize=(10, 5))

sns.countplot(x=tweet_hour, order=sorted(tweet_hour.dropna().unique()), palette='hls')
plt.xlabel('Hour', fontsize=15)
plt.ylabel('Number of tweets', fontsize=15)
plt.title('Tweets per hour', fontsize=20)
plt.show()


In [None]:
# PREPROCESSING TO-DO LIST: 
#
# 1. Change like_count to int
# 2. Change retweet_count to int
# 3. Change date format
# 4. Move time into a new column
# 5. Translate the text to English
# 6. Remove the rows with no text
# 7. Remove the rows with no hashtags
# 8. Remove the rows with no mentions
# 9. Remove the rows with no links
# 10. Remove the rows with no media
# 11. Remove the rows with no retweet_count

# CONSIDERATIONS:
# 1. Retweets are important to consider because they are a form of engagement (do not consider tweets with no retweets, but consider retweets as a further request for help)
