In [3]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from ast import literal_eval
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import SGDClassifier

### Todo:  Load processed tweets df and use processed tweets column

In [4]:
tweets_df = pd.read_csv("./raw_tweets.csv", converters={"hashtags": literal_eval})
tweets_df.head()

Unnamed: 0,State,Senator,Party,Multiple,Twitter Handle,Twitter Link,date,tweet,hashtags,link,urls
0,Alabama,Richard Shelby,Republican,0,SenShelby,https://twitter.com/SenShelby,2021-03-10 10:54:50,"I look forward to working with the Consortium,...",[],https://twitter.com/SenShelby/status/136967813...,[]
1,Alabama,Richard Shelby,Republican,0,SenShelby,https://twitter.com/SenShelby,2021-03-10 10:54:13,Pleased to announce that an AL Consortium has ...,[],https://twitter.com/SenShelby/status/136967798...,['https://www.shelby.senate.gov/public/index.c...
2,Alabama,Richard Shelby,Republican,0,SenShelby,https://twitter.com/SenShelby,2021-03-09 17:06:05,Pleased with the progress at the Port of Mobil...,[alabama],https://twitter.com/SenShelby/status/136940917...,[]
3,Alabama,Richard Shelby,Republican,0,SenShelby,https://twitter.com/SenShelby,2021-03-06 12:57:31,I am disappointed that we were blocked at ever...,[],https://twitter.com/SenShelby/status/136825946...,['https://www.shelby.senate.gov/public/index.c...
4,Alabama,Richard Shelby,Republican,0,SenShelby,https://twitter.com/SenShelby,2021-03-06 12:56:38,The bill does nothing to get kids back in clas...,[],https://twitter.com/SenShelby/status/136825924...,[]


In [5]:
tweets_df.loc[tweets_df.Party == "Independent", "Party"] = "Democrat"

In [6]:
tweets_df["Party"].value_counts()

Democrat      74579
Republican    54670
Name: Party, dtype: int64

In [10]:
def get_sentiments(df):
    tweet = df["tweet"]
    analyzer = SentimentIntensityAnalyzer()
    vs = analyzer.polarity_scores(tweet)
    df["neg"] = vs["neg"]
    df["neu"] = vs["neu"]
    df["pos"] = vs["pos"]
    df["compound"] = vs["compound"]
    return df

In [11]:
tweets_df = tweets_df.apply(get_sentiments, axis=1)
tweets_df.head()

Unnamed: 0,State,Senator,Party,Multiple,Twitter Handle,Twitter Link,date,tweet,hashtags,link,urls,neg,neu,pos,compound
0,Alabama,Richard Shelby,Republican,0,SenShelby,https://twitter.com/SenShelby,2021-03-10 10:54:50,"I look forward to working with the Consortium,...",[],https://twitter.com/SenShelby/status/136967813...,[],0.0,0.812,0.188,0.7351
1,Alabama,Richard Shelby,Republican,0,SenShelby,https://twitter.com/SenShelby,2021-03-10 10:54:13,Pleased to announce that an AL Consortium has ...,[],https://twitter.com/SenShelby/status/136967798...,['https://www.shelby.senate.gov/public/index.c...,0.0,0.676,0.324,0.936
2,Alabama,Richard Shelby,Republican,0,SenShelby,https://twitter.com/SenShelby,2021-03-09 17:06:05,Pleased with the progress at the Port of Mobil...,[alabama],https://twitter.com/SenShelby/status/136940917...,[],0.0,0.709,0.291,0.8934
3,Alabama,Richard Shelby,Republican,0,SenShelby,https://twitter.com/SenShelby,2021-03-06 12:57:31,I am disappointed that we were blocked at ever...,[],https://twitter.com/SenShelby/status/136825946...,['https://www.shelby.senate.gov/public/index.c...,0.129,0.672,0.199,0.4215
4,Alabama,Richard Shelby,Republican,0,SenShelby,https://twitter.com/SenShelby,2021-03-06 12:56:38,The bill does nothing to get kids back in clas...,[],https://twitter.com/SenShelby/status/136825924...,[],0.15,0.85,0.0,-0.5267


In [12]:
tweets_df.to_csv("./tweets_with_sentiment.csv")

In [13]:
X = tweets_df[["tweet", "neg", "neu", "pos", "compound"]]
X.head()

Unnamed: 0,tweet,neg,neu,pos,compound
0,"I look forward to working with the Consortium,...",0.0,0.812,0.188,0.7351
1,Pleased to announce that an AL Consortium has ...,0.0,0.676,0.324,0.936
2,Pleased with the progress at the Port of Mobil...,0.0,0.709,0.291,0.8934
3,I am disappointed that we were blocked at ever...,0.129,0.672,0.199,0.4215
4,The bill does nothing to get kids back in clas...,0.15,0.85,0.0,-0.5267


In [14]:
y = tweets_df["Party"]
y.head()

0    Republican
1    Republican
2    Republican
3    Republican
4    Republican
Name: Party, dtype: object

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html#sphx-glr-auto-examples-compose-plot-column-transformer-mixed-types-py
# https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
numeric_features = ["neg", "neu", "pos", "compound"]
numeric_transformer = MinMaxScaler()

text_features = ["tweet"]
text_transformer = Pipeline([
    ("vect", CountVectorizer()),
    ("tfidf", TfidfTransformer())
])

preprocessor = ColumnTransformer(transformers=[
    ("numeric", numeric_transformer, numeric_features),
    ("text", text_transformer, text_features)
])


In [32]:
X_train.head()

Unnamed: 0,tweet,neg,neu,pos,compound
108467,I joined Sen. Murphy to introduce the bipartis...,0.158,0.655,0.187,0.1531
54073,"#HR1, the For the People Act, makes it easier ...",0.041,0.819,0.14,0.6486
125799,The incredible folks at #HSI continue to work ...,0.315,0.605,0.08,-0.8122
84994,We cannot leave behind the millions of America...,0.16,0.753,0.088,-0.4857
66288,"Nevadenses, el NHLC ha recopilado información ...",0.0,0.906,0.094,0.34


In [37]:
X_train["tweet"].shape

(103399,)

In [30]:
X_train.shape

(103399, 5)

In [31]:
ytrain.shape

(103399, 1)

In [28]:
preprocessor.fit(X_train, ytrain)

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 103399 and the array at index 1 has size 1

In [26]:
ytrain = y_train.reshape(-1, 1)

In [27]:
ytrain.shape

(103399, 1)

In [17]:
le = LabelEncoder()
y_train = le.fit_transform(y_train)
le.inverse_transform([0, 1])

array(['Democrat', 'Republican'], dtype=object)

In [20]:
X_train.shape

(103399, 5)

In [21]:
y_train.shape

(103399,)

In [18]:
# https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
clf = Pipeline([
    ("preprocessor", preprocessor),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                           alpha=1e-3, random_state=42,
                           max_iter=5, tol=None))
])

clf.fit(X_train, y_train)

""" 
parameters = {
     'vect__ngram_range': [(1, 1), (1, 2)],
     'tfidf__use_idf': (True, False),
     'clf__alpha': (1e-2, 1e-3),
}

gs_clf = GridSearchCV(clf, parameters, cv=10, n_jobs=-1)
gs_clf = gs_clf.fit(X_train, y_train)
print(gs_clf.score(X_test, y_test))
#gs_clf.pred(X_test)
for param_name in sorted(parameters.keys()):
     print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))
     """

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 103399 and the array at index 1 has size 1