# This a basic experimentation with SVC, Logistic Regression and Decision Tree trained and tested on the data. The tweets' texts are first converted to word vectors and then these vectors replaced the 'tweets_Text' column in the dataset so that now the features are numeric values. Also, the 'tweets_Date' column was dropped and not considered in the training set for now due to its ambigous value. Other real valued columns were normalized. 

In [27]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Replace the dataset path with your own appropiate path

data = pd.read_csv('Data-clean/Jan-Oct-2020_std_txtCleaned_w_label.csv')


In [3]:
data.head(5)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,tweets_Date,tweets_Text,tweets_Replies,tweets_Retweets,tweets_Likes,has_image,has_Link,tweets_ResponseCategory
0,0,0,Sep 30,initial reports suggest three rockets struck b...,96.0,306.0,424.0,0,0,1
1,1,1,Sep 30,senate approved stopgap spending bill order av...,164.0,138.0,341.0,1,1,2
2,2,2,Sep 30,seven people shot funeral home milwaukee treat...,88.0,191.0,296.0,1,1,1
3,3,3,Sep 30,clare bronfman heiress seagram liquor fortune ...,64.0,237.0,651.0,1,1,0
4,4,4,Sep 30,trump plans participate next two presidential ...,491.0,155.0,510.0,1,0,3


In [4]:
labels = data["tweets_ResponseCategory"]
data.drop("tweets_ResponseCategory", axis=1, inplace=True)

In [5]:
data[data['tweets_Text'].isnull()].index.tolist()

[]

In [7]:
# data.loc[2811]

In [9]:
# data = data.drop([2811])

In [10]:
cv = CountVectorizer()
vectorized_tweets = cv.fit_transform(data['tweets_Text'])
count_vect_df = pd.DataFrame(vectorized_tweets.todense(), columns=cv.get_feature_names())

new_data=pd.concat([data, count_vect_df], axis=1)


In [11]:
new_data.drop(['tweets_Text', 'tweets_Date'], axis=1, inplace=True)

In [12]:
new_data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,tweets_Replies,tweets_Retweets,tweets_Likes,has_image,has_Link,017,10,100,...,zealands,zef,zero,zeta,zetas,zion,zone,zones,zoo,zverev
0,0,0,96.0,306.0,424.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,164.0,138.0,341.0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,2,88.0,191.0,296.0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,3,64.0,237.0,651.0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,4,491.0,155.0,510.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# for i in range(len(data)):
# #     columnsData = dfObj.loc[ i , 'Age' ]
#     print(type(data.loc[i,'tweets_Replies']))
# #     if type(data.loc[i,'tweets_Replies']) == str:
# #         print(i)

In [15]:
cols_to_norm = ['tweets_Replies', 'tweets_Retweets', 'tweets_Likes']
new_data[cols_to_norm] = preprocessing.MinMaxScaler().fit_transform(new_data[cols_to_norm])
new_data[cols_to_norm].describe()

Unnamed: 0,tweets_Replies,tweets_Retweets,tweets_Likes
count,2811.0,2811.0,2811.0
mean,0.048155,0.027109,0.044302
std,0.073818,0.045918,0.068958
min,0.0,0.0,0.0
25%,0.01321,0.007624,0.012749
50%,0.025099,0.014034,0.022728
75%,0.052368,0.029022,0.047824
max,1.0,1.0,1.0


In [16]:
X_train, X_test, Y_train, Y_test = train_test_split(new_data, labels, test_size=0.2)

In [17]:
X_train.head(5)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,tweets_Replies,tweets_Retweets,tweets_Likes,has_image,has_Link,017,10,100,...,zealands,zef,zero,zeta,zetas,zion,zone,zones,zoo,zverev
2370,2371,2371,0.018117,0.008577,0.016381,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
146,146,146,0.046613,0.017586,0.042444,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1328,1328,1328,0.008681,0.019839,0.02176,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2039,2040,2040,0.024156,0.01633,0.015923,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1742,1742,1742,0.004152,0.002382,0.004895,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 1. Try with Logistic Regression

In [18]:
# if max_iter is reached without convergence, we can just increase the max_iter  
solver_list = ['newton-cg', 'sag','saga','lbfgs']
score_list = []
for s in solver_list:
    clf = LogisticRegression(solver=s, max_iter=1000)
    clf.fit(X_train, Y_train)
    score_list.append(clf.score(X_test, Y_test))
print(score_list)



[0.40852575488454707, 0.21847246891651864, 0.21847246891651864, 0.41207815275310833]


## 2. Try with Decision Tree(with and without cross-validation)

In [19]:
clf = DecisionTreeClassifier()
clf.fit(X_train, Y_train)
clf.score(X_test, Y_test)

1.0

In [20]:
print(max(cross_val_score(clf, new_data, labels, cv=5)))

1.0


## 3. Try with SVM

In [21]:
clf = SVC()
clf.fit(X_train, Y_train)
clf.score(X_test, Y_test)



0.32149200710479575

### 4. Try with Neural Network

In [24]:
clf = MLPClassifier(random_state=1, max_iter=1000)

In [25]:
clf.fit(X_train, Y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=1000, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=1, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [26]:
clf.score(X_test, Y_test)

0.3783303730017762

### 5. Random Forrest

In [34]:
clf = RandomForestClassifier(max_depth=7, random_state=0)

In [35]:
clf.fit(X_train, Y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=7, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [36]:
clf.score(X_test, Y_test)

0.47424511545293074