In [1]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split,cross_val_score # for splitting the data into train and test samples
from sklearn.metrics import classification_report, confusion_matrix   # for model evaluation metrics
from sklearn.svm import SVC # for Support Vector Classification model
from sklearn import svm
from sklearn.metrics import accuracy_score
import plotly.express as px  # for data visualization
import plotly.graph_objects as go # for data visualization
from sklearn.preprocessing import StandardScaler

In [2]:
# Roster of House and Senate members only (influencers are not in these files).
def _load_chamber(csv_path, party_column="Party"):
    """Read one chamber's roster CSV and return its user_name / State / Party columns.

    ``user_name`` is the ``Link`` column with the "https://twitter.com/" prefix
    removed, and serves as the member's ID.  ``party_column`` is the header under
    which that CSV stores the party; it is exposed as ``Party`` in the result.
    """
    roster = pd.read_csv(csv_path, sep=',', header=0)
    roster["user_name"] = roster["Link"].str.replace("https://twitter.com/", "", regex=False)
    return roster[["user_name", "State", party_column]].rename(columns={party_column: "Party"})


# Paths are relative to the working directory; swap in the '../../data/' prefix
# if the CSVs live two levels up.
house_df = _load_chamber('data/congress_twitter_117th_house.csv')
# The Senate CSV's party header carries a trailing space ("Party "); it is normalised to "Party".
senate_df = _load_chamber('data/congress_twitter_117th_senate.csv', party_column="Party ")
# Stack both chambers under a fresh 0..n-1 index.
house_senate_df = pd.concat([house_df, senate_df]).reset_index(drop=True)

In [9]:
# User names of every tracked account, influencers included.
# The encoding is spelled out so decoding does not depend on the platform default.
with open('user_lookup.json', encoding='utf-8') as json_file:
    user_lookup = json.load(json_file)
# One row per lookup entry, in the order the entries appear in the file; the
# JSON keys themselves are not kept.
data = pd.DataFrame({'user_name': list(user_lookup.values())})

# Profile vectors, influencers included.
# NOTE(review): presumably row i of this file belongs to the i-th lookup entry —
# nothing in this cell verifies that pairing.
profile = np.loadtxt('user_profile_151')

# np.atleast_2d stands in for the discouraged np.matrix: both promote a 1-D
# result (single-row file) to shape (1, n) and leave a 2-D array untouched.
# Each row is stored as a plain Python list in a single 'profile' column.
profiles = pd.DataFrame({'profile': np.atleast_2d(profile).tolist()})

In [10]:
# Full information on Senate and House members; accounts that are not in the
# congressional roster (the influencers) drop out in the inner merge below.

# User names and profile vectors are paired purely by row position.  With
# different lengths, the column-wise concat would silently NaN-pad the shorter
# frame, so fail fast with a clear message instead.
if len(data) != len(profiles):
    raise ValueError(
        f"user lookup has {len(data)} rows but the profile table has {len(profiles)}; "
        "they must line up one-to-one"
    )
temp = pd.concat([data, profiles], axis=1)
combined = temp.merge(house_senate_df, on='user_name', how='inner')
# Party is kept as its string label (no numeric encoding is applied).
# Independents are excluded: there are only 2 people in party I.
combined = combined[combined['Party'] != 'I']

In [107]:
# with open("raw_tables_2023_03_05/raw_tweets", "rb") as raw_timeline_file:
#     tweet_df = pd.read_pickle(raw_timeline_file)
#     # tweet_df.to_csv('raw_tweets_out.csv')
#     print(tweet_df["content"].head(5))
# tweet_df_sample = tweet_df.sample(n = 10000)

# import numpy as np
# mock_profile_df = tweet_df_sample[["user_name"]].copy()
# mock_profile_df["profile"] = [np.random.rand(5) for i in range(len(mock_profile_df))]
# print(mock_profile_df.head())

In [11]:
# Fixed seed so the train/test split — and therefore every score printed
# below and in the later grid-search cells — is the same on each run.
RANDOM_STATE = 42

# Feature matrix: one profile vector per member; target: the party label.
X = np.array(combined["profile"].values.tolist())
y = np.array(combined["Party"])
# Hold out 20% of the members for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)
# NOTE: the features are fed to the classifier unscaled (no StandardScaler step is applied).
# Create an SVM classifier with an RBF kernel
model = SVC(kernel='rbf')
# Perform 5-fold cross-validation on the training data
scores = cross_val_score(model, X_train, y_train, cv=5)
# Print the cross-validation scores
print("Cross-validation scores:", scores)
# Train the classifier on the full training split
model.fit(X_train, y_train)
# Make predictions on the held-out testing data
y_pred = model.predict(X_test)
# Calculate the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)
# Per-class precision / recall / F1, followed by the raw confusion matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Cross-validation scores: [0.96470588 0.96470588 0.95238095 0.96428571 0.96428571]
Accuracy: 0.9811320754716981
              precision    recall  f1-score   support

           D       0.98      0.98      0.98        59
           R       0.98      0.98      0.98        47

    accuracy                           0.98       106
   macro avg       0.98      0.98      0.98       106
weighted avg       0.98      0.98      0.98       106

[[58  1]
 [ 1 46]]


In [12]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the SVC hyper-parameters C and gamma (6 x 5 = 30 combinations)
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'gamma': [10, 1, 0.1, 0.01, 0.001],
}
# Exhaustive search over the grid with a default-constructed SVC;
# refit=True retrains the winning combination so `grid` can predict directly
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=1)
# Run the search on the training split only
grid.fit(X_train, y_train)
# Winning hyper-parameter combination
best_params = grid.best_params_
print(best_params)
# The refitted estimator carrying those hyper-parameters
print(grid.best_estimator_)
# Score of the refitted estimator on the held-out test split
print(grid.score(X_test, y_test))

Fitting 5 folds for each of 30 candidates, totalling 150 fits
{'C': 100, 'gamma': 0.01}
SVC(C=100, gamma=0.01)
0.9622641509433962


In [8]:
# Label the held-out split with the estimator selected by the grid search
grid_predictions = grid.predict(X_test)
# Confusion matrix (true labels passed first, predictions second), then the
# per-class precision / recall / F1 report, each on its own line
print(
    confusion_matrix(y_test, grid_predictions),
    classification_report(y_test, grid_predictions),
    sep="\n",
)

[[48  4]
 [ 7 47]]
              precision    recall  f1-score   support

           D       0.87      0.92      0.90        52
           R       0.92      0.87      0.90        54

    accuracy                           0.90       106
   macro avg       0.90      0.90      0.90       106
weighted avg       0.90      0.90      0.90       106

