In [47]:
#!pip install pandas
#!pip install matplotlib
#!pip install seaborn
#!pip install nltk
#!pip install wordcloud
#!pip install sklearn
#pip install text_preprocessing


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.1.2[0m[39;49m -> [0m[32;49m22.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [48]:
# Step 0. Load libraries and custom modules
# Dataframes and matrices ----------------------------------------------
import pandas as pd
import numpy as np
# Graphics -------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import style
plt.style.use('tableau-colorblind10')
# Mathematical functions -----------------------------------------------
from scipy.stats import norm
# Text processors ------------------------------------------------------
import unicodedata
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
# Preprocessing --------------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# Text modeling --------------------------------------------------------
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Model creating -------------------------------------------------------
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
# Metrics --------------------------------------------------------------
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import make_scorer
# Custom functions -----------------------------------------------------
# Uses a function created in the project's SRC folder
from textpreprocessing import clean_stopwords 


In [36]:
# Sentiment lexicon used as the dataset for the text analysis.
# Published in: Saif M. Mohammad and Peter Turney (2013),
# "Crowdsourcing a Word-Emotion Association Lexicon",
# Computational Intelligence, 29(3): 436-465.
# For research and educational purposes only.
# URL: http://saifmohammad.com/WebPages/lexicons.html
# The CSV is read with explicit column names, then only the rows whose
# polarity equals 1 are kept.
nrc = (
    pd.read_csv('/workspace/NLP/data/raw/NRC.csv', names=['word', 'sentiment', 'polarity'])
    .loc[lambda lexicon: lexicon['polarity'] == 1]
)

In [38]:
# Step 1. Load the data
# Source: https://www.thetrumparchive.com
# Case: during the 2016 US presidential election, candidate Donald Trump
# used Twitter to communicate with potential voters.
# The campaign ran from 2015-06-17 to 2016-11-08.
# Goal: analyze the campaign tweets sent from iPhone and Android.
# Column description
# source                  -> device of origin
# id_str                  -> unique identifier
# text                    -> tweet text content
# created_at              -> date of creation (stored as text when loaded)
# retweet_count           -> count of retweets (diffusion)
# in_reply_to_user_id_str -> if it's a reply, the id of the user replied to
# favorite_count          -> count of users that liked the tweet
# is_retweet              -> whether the post is a retweet
# 1.1 Open the data and get a glimpse of its structure
df_raw = pd.read_csv(filepath_or_buffer='/workspace/NLP/data/raw/trump_tweets (1).csv')
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20761 entries, 0 to 20760
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   source                   20761 non-null  object 
 1   id_str                   20761 non-null  int64  
 2   text                     20761 non-null  object 
 3   created_at               20761 non-null  object 
 4   retweet_count            20761 non-null  int64  
 5   in_reply_to_user_id_str  2442 non-null   float64
 6   favorite_count           20761 non-null  int64  
 7   is_retweet               20761 non-null  bool   
dtypes: bool(1), float64(1), int64(3), object(3)
memory usage: 1.1+ MB


In [39]:
# 1.2 Look at 10 randomly drawn observations
# (no random_state is set, so the rows shown change on every run)
df_raw.sample(n=10)

Unnamed: 0,source,id_str,text,created_at,retweet_count,in_reply_to_user_id_str,favorite_count,is_retweet
8100,Twitter Web Client,309304210636300288,@homeedmam I wish him the best of luck.,2013-03-06T14:07:25Z,0,,0,False
13350,Twitter for Android,635058369431252992,"Alabama was great last night, amazing people. ...",2015-08-22T11:58:08Z,3273,,7551,False
3145,Twitter Web Client,220170240875700224,Great poll numbers for @MittRomney just out--h...,2012-07-03T15:00:50Z,137,,25,False
3486,Twitter Web Client,199322476180815873,#CelebrityApprentice Boardrooms—can anything b...,2012-05-07T02:19:16Z,38,,6,False
20391,Twitter for iPhone,838861512999649286,'President Trump Congratulates Exxon Mobil for...,2017-03-06T21:19:04Z,12846,,70675,False
11024,Twitter for Android,484921289715376128,Do these very stupid politicians who got us in...,2014-07-04T04:46:59Z,87,,105,False
19065,Twitter for iPhone,919313707355787264,The Democrats in the Southwest part of Virgini...,2017-10-14T21:27:22Z,16003,,71175,False
13462,Twitter for Android,627841345789558788,I wish good luck to all of the Republican cand...,2015-08-02T14:00:16Z,6526,,9636,False
4414,Twitter for Android,406400091092971520,I am in Miami at Trump National Doral. Just ga...,2013-11-29T12:31:46Z,42,,68,False
10805,Twitter Web Client,496640871727906816,Doctors have already died treating Ebola http:...,2014-08-05T12:56:25Z,411,,389,False


In [53]:
# Step 2. Transform and wrangle the data
# 2.1 Work on a deep copy so that df_raw stays untouched
df_interim = df_raw.copy(deep=True)

In [54]:
# 2.2 Drop the columns that carry no information for this analysis
# NOTE: re-running this cell raises KeyError because the columns are already gone.
df_interim = df_interim.drop(columns=['id_str', 'is_retweet', 'in_reply_to_user_id_str'])

In [56]:
# 2.3 Convert columns to the right format
# created_at: parse with pd.to_datetime instead of .astype('datetime64'),
# because the unit-less 'datetime64' dtype is rejected by pandas >= 2.0.
# The values are parsed as UTC and the timezone is then dropped, so the
# column stays a naive datetime64[ns] expressed in UTC, and the cell can be
# re-run safely on an already converted column.
df_interim['created_at'] = (
    pd.to_datetime(df_interim['created_at'], utc=True).dt.tz_localize(None)
)
# source: low-cardinality text column, stored as a categorical
df_interim['source'] = pd.Categorical(df_interim['source'])

In [57]:
# 2.4 Keep only the tweets created inside the campaign window
# (both bounds inclusive).
# NOTE(review): a bare date string is compared as midnight of that day, so
# tweets created later on 2016-11-08 fall outside the window; the filter is
# also applied before the UTC -> EST shift of the next step. Confirm that
# both are intended.
df_interim = df_interim.loc[
    df_interim['created_at'].between('2015-06-17', '2016-11-08')
]

In [58]:
# 2.5 Timestamps are treated as UTC; convert to EST by subtracting 5 hours
# NOTE: not idempotent, every re-run of this cell shifts the column by a
# further 5 hours. The fixed offset does not follow daylight saving time.
df_interim['created_at'] = df_interim['created_at'] - pd.Timedelta(hours=5)

In [60]:
# 2.6 Filter source for analysis: keep only the tweets whose source
# mentions iPhone or Android, then drop the categories left without rows
df_interim = (
    df_interim
    .loc[lambda frame: frame['source'].str.contains('iPhone|Android')]
    .assign(source=lambda frame: frame['source'].cat.remove_unused_categories())
)

In [62]:
# 2.7 Persist the interim dataset (without the index) and create a
# separate copy to run the analysis on
df_interim.to_csv(path_or_buf='../data/interim/trump_tweets.csv', index=False)
df = df_interim.copy(deep=True)
df = df_interim.copy()

In [63]:
# Step 3. Perform exploratory data analysis (EDA)
# 3.1 Get basic info: index range, column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3946 entries, 12072 to 18506
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   source          3946 non-null   category      
 1   text            3946 non-null   object        
 2   created_at      3946 non-null   datetime64[ns]
 3   retweet_count   3946 non-null   int64         
 4   favorite_count  3946 non-null   int64         
dtypes: category(1), datetime64[ns](1), int64(2), object(1)
memory usage: 158.0+ KB


In [None]:
# 3.2 Look at 10 randomly drawn rows of the analysis dataset
# (no random_state is set, so the rows shown change on every run)
df.sample(n=10)

In [None]:
# 3.3 Describe numerical and datetime data
# `datetime_is_numeric` is only accepted by pandas 1.x; pandas >= 2.0 removed
# the keyword (datetime columns are always summarized numerically there) and
# raises TypeError when it is passed, so fall back to the plain call.
try:
    numeric_summary = df.describe(datetime_is_numeric=True)
except TypeError:
    numeric_summary = df.describe()
# Last expression of the cell, so the summary table is displayed
numeric_summary

In [None]:
# 3.4 Describe categorical data: number of tweets per source
df.loc[:, 'source'].value_counts()

In [None]:
# 3.5 Get histograms for numerical data

In [None]:
# 3.6 Make a plot of tweet frequency by source

In [None]:
# 3.7 Get a glimpse of the most retweeted tweets

In [None]:
# 3.8 Get a glimpse of the most liked tweets

In [None]:
# 3.9 Let's get a glimpse of common words in the tweets' text

In [None]:
# 3.10 Process text to extract stopwords

In [None]:
# 3.11 Extract urls
# Pattern for t.co short links (https only) and the literal HTML entity
# "&amp;". Written as a raw string so that \d and \. reach the regex engine
# instead of being read by Python as (invalid) string escapes; the dot in
# "t.co" is escaped so it matches a literal dot rather than any character.
url_pat = r'https://t\.co/[A-Za-z\d]+|&amp;'


In [None]:
# 3.12 Extract special characters


In [None]:
# 3.13 Extract numbers


In [None]:
# 3.14 See the results


In [None]:
# 3.15 Let's see a wordcloud
