In [23]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [18]:
def load_tweets(path, flag):
    """Read one tweet CSV and tag every row with a class label.

    Parameters
    ----------
    path : str or path-like
        CSV file to read; it must contain a ``tweet_text`` column.
    flag : scalar
        Value written to the ``class`` column of every returned row.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``text`` (renamed from ``tweet_text``) and ``class``.
    """
    tweets = pd.read_csv(path)
    # Keep only the tweet body; every other CSV column is discarded.
    labelled = tweets.loc[:, ['tweet_text']].rename(columns={'tweet_text': 'text'})
    # 'class' is a Python keyword, hence the dict-unpacking form of assign().
    return labelled.assign(**{'class': flag})

def load_data():
    """Load both raw tweet files and stack them into one labelled frame.

    Pro-Trump tweets receive class 0 and pro-Biden tweets class 1. The rows
    are concatenated in that order and re-indexed from 0.

    Returns
    -------
    pandas.DataFrame
        Columns ``text`` and ``class`` as produced by ``load_tweets``.
    """
    sources = [
        ('../data/raw/pro-trump-tweets.csv', 0),
        ('../data/raw/pro-biden-tweets.csv', 1),
    ]
    frames = [load_tweets(csv_path, label) for csv_path, label in sources]
    return pd.concat(frames, ignore_index=True)

In [20]:
# Build the combined, labelled tweet corpus from both raw CSV files.
corpus = load_data()

# Sanity check: print (rows, columns), then display the first few rows.
print(corpus.shape)
corpus.head()

(3582, 2)


Unnamed: 0,text,class
0,80 Million Unsolicited Ballots are impossible ...,0
1,Primary threat from #Laura in #NOLA is storm ...,0
2,Heads up! Today is the last day to prepare fo...,0
3,Hurricane #Laura is moving quickly. Be sure ...,0
4,Texans should heed guidance from local offici...,0


In [21]:
# Inspect the dtype of each corpus column.
corpus.dtypes

text     object
class     int64
dtype: object

In [31]:
# Score every tweet with VADER. Each polarity_scores() call returns one dict,
# and the list of dicts becomes one DataFrame row per tweet.
analyzer = SentimentIntensityAnalyzer()
scores = pd.DataFrame(
    [analyzer.polarity_scores(text) for text in corpus['text']],
    # Reuse the corpus index so that `corpus.join(scores)` aligns row-for-row
    # even if `corpus` ever stops having a default 0..n-1 index.
    index=corpus.index,
)

In [60]:
# Attach the sentiment scores to the tweets (index-aligned join) and fix the
# column order so the label comes last.
df = corpus.join(scores).loc[:, ['text', 'neg', 'neu', 'pos', 'compound', 'class']]

print(df.shape)
df.head()

(3582, 6)


Unnamed: 0,text,neg,neu,pos,compound,class
0,80 Million Unsolicited Ballots are impossible ...,0.324,0.565,0.111,-0.9134,0
1,Primary threat from #Laura in #NOLA is storm ...,0.139,0.861,0.0,-0.5267,0
2,Heads up! Today is the last day to prepare fo...,0.0,1.0,0.0,0.0,0
3,Hurricane #Laura is moving quickly. Be sure ...,0.0,0.897,0.103,0.3182,0
4,Texans should heed guidance from local offici...,0.0,0.915,0.085,0.0772,0


In [35]:
import nltk
# Fetch the NLTK stopword corpus into the local nltk_data directory so that
# `stopwords.words('english')` further down can read it.
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/chris/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Turn each tweet into a TF-IDF vector limited to 1500 vocabulary terms,
# ignoring English stopwords.
# NOTE(review): the vectorizer is fitted on every tweet before the train/test
# split performed later in the notebook, so the vocabulary and IDF weights are
# learned from the test rows as well — consider fitting on training rows only.
converter = TfidfVectorizer(max_features=1500, stop_words=stopwords.words('english'))
vecs = converter.fit_transform(df['text'])
# Densify once with toarray() rather than building one sparse column per term
# and converting each back, and carry over `df`'s index so the later
# `tfidf.join(df)` lines rows up explicitly instead of relying on both frames
# happening to share a default RangeIndex.
tfidf = pd.DataFrame(vecs.toarray(), index=df.index)

In [54]:
# Name every TF-IDF column after its term. `vocabulary_` maps term -> column
# position, so ordering by position lines the names up with the matrix columns.
vocab = {
    f'tfidf_{term}': position
    for term, position in sorted(converter.vocabulary_.items(), key=lambda pair: pair[1])
}
tfidf.columns = list(vocab)
tfidf.head()

Unnamed: 0,tfidf_00,tfidf_000,tfidf_10,tfidf_100,tfidf_11,tfidf_13,tfidf_14,tfidf_15,tfidf_150,tfidf_175,...,tfidf_years,tfidf_yes,tfidf_yesterday,tfidf_yet,tfidf_young,tfidf_youtube,tfidf_yuma,tfidf_yup,tfidf_z7sp1ymbft,tfidf_zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [61]:
# Combine the TF-IDF features with the sentiment scores and the label.
# Selecting the wanted columns explicitly (instead of joining all of `df` and
# then dropping 'text') keeps this cell re-runnable: once `df` has been
# overwritten it already holds the tfidf_* columns, and joining it whole would
# fail on the overlapping column names.
df = tfidf.join(df[['neg', 'neu', 'pos', 'compound', 'class']])
df.head()

Unnamed: 0,tfidf_00,tfidf_000,tfidf_10,tfidf_100,tfidf_11,tfidf_13,tfidf_14,tfidf_15,tfidf_150,tfidf_175,...,tfidf_youtube,tfidf_yuma,tfidf_yup,tfidf_z7sp1ymbft,tfidf_zoom,neg,neu,pos,compound,class
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.324,0.565,0.111,-0.9134,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.139,0.861,0.0,-0.5267,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.897,0.103,0.3182,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.915,0.085,0.0772,0


In [63]:
from sklearn.model_selection import train_test_split

# Separate the label from the features, then hold out a quarter of the rows
# for evaluation (fixed seed so the split is repeatable).
y = df['class']
X = df.drop(columns='class')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.25
)

In [64]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the training rows and predict the held-out rows.
# The seed is fixed (same value as the train/test split) because the forest's
# bootstrap sampling and feature sub-sampling are random; without it the
# reported accuracy changes on every run.
rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

In [65]:
from sklearn.metrics import accuracy_score

# Fraction of held-out tweets whose class was predicted correctly.
accuracy_score(y_test, y_pred)

0.7667410714285714