In [None]:
import pandas as pd
import numpy as np
import scipy

In [None]:
#mounts Google Drive at /content/drive so the data and model paths used below resolve
#(Colab-only: google.colab is not available in other environments)
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#loads the merged tweet dataset; the columns used later in this notebook are
#'text', 'screen_name' and 'nominate_dim1'
final_merged = pd.read_csv('/content/drive/MyDrive/Applied Deep Learning Poli Sci/Foley and Dorner Repo/Data/final_merged.csv')

In [None]:
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from multiprocessing import Pool

#fetches the NLTK corpora needed by stopwords.words() and WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

#initializes TweetTokenizer and other preprocessing tools
#preserve_case=False lowercases tokens, strip_handles=True drops @handles,
#reduce_len=True shortens long runs of a repeated character
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
#kept as a set so the membership test in preprocess_tweet is a hash lookup
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

#function that preprocesses a single tweet
def preprocess_tweet(tweet):
    """Tokenize one tweet and return its cleaned, lemmatized tokens.

    Uses the notebook-level tokenizer, stop_words and lemmatizer.

    Args:
        tweet: Raw tweet text. Anything that is not a str (for example the
            float NaN pandas stores for an empty 'text' cell, or None) is
            treated as an empty tweet instead of crashing the tokenizer.

    Returns:
        list: Tokens that do not start with 'http' and are not in stop_words,
        each passed through lemmatizer.lemmatize. Empty list for non-string
        input.
    """
    #a single bad row would otherwise abort the whole Pool.map run
    if not isinstance(tweet, str):
        return []
    #one pass: drop URLs and stopwords, lemmatize whatever is left
    return [
        lemmatizer.lemmatize(token)
        for token in tokenizer.tokenize(tweet)
        if not token.startswith('http') and token not in stop_words
    ]

#function that spreads preprocess_tweet over a pool of worker processes
def preprocess_tweets_parallel(tweets, num_processes=2):
    """Apply preprocess_tweet to every tweet using a multiprocessing Pool.

    Args:
        tweets: Iterable of raw tweets.
        num_processes: Number of worker processes in the pool.

    Returns:
        list: One preprocess_tweet result per input tweet, in input order.
    """
    #map() blocks until every result is in, so returning inside the with is safe
    with Pool(processes=num_processes) as workers:
        return workers.map(preprocess_tweet, tweets)


In [None]:
#raw tweet texts as a plain list, in the same row order as final_merged
tweets = final_merged['text'].tolist()

#preprocess tweets in parallel
#result has one token list per tweet, in the same order as `tweets`
preprocessed_tweets = preprocess_tweets_parallel(tweets, num_processes=2)

In [None]:
from gensim.models import Word2Vec

#trains the Word2Vec model on the tokenized tweets
#vector_size=100 -> 100-dimensional embeddings; window=5 -> context window of 5 tokens;
#min_count=100 -> words seen fewer than 100 times are left out of the vocabulary
#NOTE(review): no seed is passed and workers=2, so embeddings may differ between runs - confirm whether that matters
model = Word2Vec(sentences=preprocessed_tweets, vector_size=100, window=5, min_count=100, workers=2) #these parameters can be adjusted for trial and error


In [None]:
#model.save('/content/drive/MyDrive/Applied Deep Learning Poli Sci/Foley and Dorner Repo/Word2Vec Models/w2v100_tweets_model.model')

In [None]:
from sklearn.cluster import KMeans

#number of k-means clusters (adjustable)
num_clusters = 500

#embedding matrix taken from the trained Word2Vec model
word_vectors = model.wv.vectors

#fits k-means on the embeddings with a fixed random_state
kmeans = KMeans(n_clusters=num_clusters, random_state=0)
kmeans.fit(word_vectors)

#pairs each vocabulary word (in model.wv.index_to_key order) with the cluster label at the same position
word_to_cluster = dict(zip(model.wv.index_to_key, kmeans.labels_))

In [None]:
import json

#casts every cluster label to a built-in int before serializing
#(the labels come from kmeans.labels_, a numpy array, and numpy integers are not JSON-serializable)
word_to_cluster_model = {word: int(cluster) for word, cluster in word_to_cluster.items()}

#writes the word -> cluster mapping to Drive as JSON
with open('/content/drive/MyDrive/Applied Deep Learning Poli Sci/Foley and Dorner Repo/Models/word_to_cluster_model.json', 'w') as file:
    json.dump(word_to_cluster_model, file)

In [None]:
#this function represents a tweet as the fraction of its known words that falls into each cluster
def tweet_to_cluster_distribution(tweet, cluster_lookup=None, n_clusters=None):
    """Return the per-cluster share of a tweet's known words.

    Args:
        tweet: Iterable of preprocessed tokens.
        cluster_lookup: Optional mapping of word -> integer cluster id.
            Defaults to the notebook-level word_to_cluster, so existing
            one-argument calls behave as before. Passing it explicitly lets
            the function be reused with a mapping reloaded from JSON.
        n_clusters: Optional minimum length of the returned vector.
            Defaults to the notebook-level num_clusters.

    Returns:
        numpy.ndarray: Entry k is (number of tokens mapped to cluster k) /
        (number of tokens found in cluster_lookup). Tokens missing from the
        lookup are ignored. If no token is known, a zero vector of length
        n_clusters is returned.
    """
    if cluster_lookup is None:
        cluster_lookup = word_to_cluster
    if n_clusters is None:
        n_clusters = num_clusters
    clusters = [cluster_lookup[word] for word in tweet if word in cluster_lookup]
    if not clusters:
        return np.zeros(n_clusters)  # Return zero vector if no known words
    #minlength pads the counts so every tweet yields a vector of the same length
    cluster_counts = np.bincount(clusters, minlength=n_clusters)
    return cluster_counts / len(clusters)

#runs the function on all tweets
#preprocessed_tweets was built from final_merged['text'], so entry i belongs to row i of final_merged
tweet_distributions = [tweet_to_cluster_distribution(tweet) for tweet in preprocessed_tweets]

#aggregates a list of RAW tweets into the mean of their tweet distributions
#(kept for callers that only have raw text; the per-user table below reuses
#tweet_distributions instead, so the tweets are not preprocessed a second time)
def aggregate_user_distributions(tweets):
    """Return the mean cluster distribution of a list of raw tweets.

    Args:
        tweets: List of raw tweet texts; each one is run through
            preprocess_tweet and tweet_to_cluster_distribution.

    Returns:
        numpy.ndarray: Element-wise mean of the per-tweet distributions, or
        a zero vector of length num_clusters when `tweets` is empty.
    """
    per_tweet = [tweet_to_cluster_distribution(preprocess_tweet(tweet)) for tweet in tweets]
    return np.mean(per_tweet, axis=0) if len(per_tweet) > 0 else np.zeros(num_clusters)

#the positional pairing below is only valid if nothing was dropped or reordered in between
if len(tweet_distributions) != len(final_merged):
    raise ValueError('tweet_distributions is out of sync with final_merged; re-run the preprocessing cells')

#represents each user as the mean of their tweets' distributions
#the Series is named 'text' so the column name after reset_index stays what later cells expect
user_distributions = (
    pd.Series(tweet_distributions, index=final_merged.index, name='text')
    .groupby(final_merged['screen_name'])
    .apply(lambda dists: np.mean(dists.tolist(), axis=0))
)

In [None]:
#turns the screen_name index into a regular column
#guarded so that re-running this cell does not reset the index a second time
if isinstance(user_distributions, pd.Series):
    user_distributions = user_distributions.reset_index()

#dataframe with fewer columns, reduced to ONE row per user
#final_merged holds several rows per screen_name (it is grouped by screen_name above), so without
#drop_duplicates the merge would repeat each user's vector once per tweet and the
#later random train/test split would place identical rows on both sides
#(if a user has more than one nominate_dim1 value, the first one is kept)
user_ideology = final_merged[['screen_name', 'nominate_dim1']].drop_duplicates(subset='screen_name')

#adds aggregate distributions to a new dataframe
#validate raises a MergeError if either side has a repeated screen_name
user_data = pd.merge(user_distributions, user_ideology, on='screen_name', validate='one_to_one')

In [None]:
#binary conservative vs liberal label
#non-negative score -> 1 (conservative), negative score -> 0 (liberal)
def threshold_function(x):
    """Map a score to 1 when it is >= 0 and to 0 otherwise."""
    if x >= 0:
        return 1
    return 0

#adds the binary 'conserv' label as a new column; assign returns a new dataframe, so user_data is not modified
bin_user = user_data.assign(conserv=user_data['nominate_dim1'].apply(threshold_function))
print(bin_user)

In [None]:
#creates a binned version of the NOMINATE scores with discrete values 1-7
#assumes nominate_dim1 lies in [-1, 1]: the score is shifted to [0, 2] and cut into
#num_ideology_bins equal-width bins
#(the earlier bin width of 2/6 produced only six real bins; 7 appeared solely for a score of exactly 1)
num_ideology_bins = 7
ideology_bin_width = 2 / num_ideology_bins
#clip folds the upper edge (score == 1) into the top bin instead of creating an extra one
user_data['normal_nom'] = (
    ((user_data['nominate_dim1'] + 1) // ideology_bin_width).clip(0, num_ideology_bins - 1) + 1
)
print(user_data)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

#subset_bin_data = bin_user.sample(frac=.1)

#creates the feature matrix by stacking the per-row distribution arrays stored in the 'text' column
X = np.stack(bin_user['text'].values)
y = bin_user['conserv']

#encodes labels as 0..k-1: get_dummies one-hot encodes them and argmax recovers the column position
y = pd.get_dummies(y).values.argmax(1)

#creates training and testing sets. Testing on 20% of data
#assumes bin_user has one row per user; repeated users would end up on both sides of the split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#trains the logistic regression classifier using the 'saga' solver
#saga is more efficient but might lower our accuracy a bit
#saga shuffles the data while fitting, so random_state is fixed to make the fit reproducible
clf = LogisticRegression(solver='saga', max_iter=1000, n_jobs=-2, random_state=42)
clf.fit(X_train, y_train)

#prediction
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")

#results recorded from earlier runs (not recomputed by this cell):
#binary accuracy: 97.3
#1-5 accuracy: 85.4
#1-7 accuracy: 76.1

In [None]:
import joblib

#saves the fitted logistic regression classifier to Drive
joblib.dump(clf, '/content/drive/MyDrive/Applied Deep Learning Poli Sci/Foley and Dorner Repo/Models/logistic_regression_model.pkl')

In [None]:
#exports both user tables without the 'text' column (it holds the distribution arrays)
#the dataframe index is written as the first CSV column because index=False is not passed
user_data.drop(columns=['text']).to_csv('/content/drive/MyDrive/Applied Deep Learning Poli Sci/Foley and Dorner Repo/Data/user_data.csv')
bin_user.drop(columns=['text']).to_csv('/content/drive/MyDrive/Applied Deep Learning Poli Sci/Foley and Dorner Repo/Data/bin_user.csv')