# Model Building and Data Preprocessing

In [12]:
import pandas as pd

# Load the training table from 'data_clean.csv' (a relative path, so it is resolved
# against the kernel's working directory). Later cells read its 'text_clean',
# 'full_text', 'hashtags', 'in_reply_to_screen_name', 'country_user', 'gender_user',
# length-count and 'pol_spec_user' columns.
training_data = pd.read_csv('data_clean.csv')

In [13]:
# Discard every row whose 'text_clean' value is missing; the remaining rows keep
# their original index labels.
required_columns = ['text_clean']
training_data = training_data.dropna(subset=required_columns)

In [14]:
# Get sentiment of tweets to help identify political views

from textblob import TextBlob

def get_sentiment(text):
    """Return TextBlob's sentiment scores for one piece of text.

    Parameters
    ----------
    text : str
        The text to score.

    Returns
    -------
    tuple
        ``(polarity, subjectivity)`` taken from ``TextBlob(text).sentiment``.
    """
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity, sentiment.subjectivity

# Score each tweet once and assemble both columns in a single DataFrame.
# Building a pd.Series per row inside .apply() is far slower on a table this
# size; the values produced are the same.
sentiment_scores = pd.DataFrame(
    training_data['text_clean'].map(get_sentiment).tolist(),
    index=training_data.index,  # keep alignment with the (non-contiguous) row labels
    columns=['sentiment_polarity', 'sentiment_subjectivity'],
)
training_data[['sentiment_polarity', 'sentiment_subjectivity']] = sentiment_scores

In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack, csr_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import LatentDirichletAllocation


# Step 1: Numeric features -- the two sentiment scores plus six length counts.
# The length columns are not computed in this cell; they are expected to be
# present in training_data already.
numeric_feature_columns = [
    'sentiment_polarity', 'sentiment_subjectivity',
    'tweet_length_chars', 'tweet_length_words',
    'hashtag_length_chars', 'hashtag_length_words',
    'text_clean_length_chars', 'text_clean_length_words',
]

# MinMaxScaler is applied to a dense array, so densify before scaling.
sentiment_dense = csr_matrix(training_data[numeric_feature_columns]).toarray()

# Rescale every column with min-max scaling. `scaler` stays bound to the fitted
# object, which holds the per-column min/max of the training rows.
scaler = MinMaxScaler()
sentiment_normalized = scaler.fit_transform(sentiment_dense)

# Back to sparse so it can be hstack-ed with the TF-IDF matrices.
sentiment = csr_matrix(sentiment_normalized)

# Step 2: Text Features (text_clean and full_text)
# TF-IDF over unigrams and bigrams of 'text_clean'. A term is kept only if it
# occurs in at least 2 documents (min_df) and in at most 1% of them (max_df).
# NOTE(review): scikit-learn treats a list passed as stop_words as the literal
# tokens to remove, so this drops only the five words below -- it does not load
# stop-word lists for those languages. Confirm whether that is intended.
text_clean_stop_words = ['english', 'swedish', 'norwegian', 'dutch', 'danish']
text_clean_vectorizer = TfidfVectorizer(
    stop_words=text_clean_stop_words,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.01,
)
text_clean_features = text_clean_vectorizer.fit_transform(training_data['text_clean'])

# Fit a 10-component LDA on the TF-IDF matrix; fit_transform returns one row of
# topic weights per tweet.
lda = LatentDirichletAllocation(random_state=42, n_components=10)
topic_probs = lda.fit_transform(text_clean_features)

# Sparse copy of the topic weights so they can be stacked with the other blocks.
topic_features = csr_matrix(topic_probs)

# Replace missing values up front so each vectorizer below receives plain strings.
# 'in_reply_to_screen_name' gets the placeholder 'missing'; the other two get ''.
nan_fillers = {'full_text': '', 'hashtags': '', 'in_reply_to_screen_name': 'missing'}
for column, filler in nan_fillers.items():
    training_data[column] = training_data[column].fillna(filler)

# TF-IDF for 'full_text': unigrams + bigrams, no stop-word filtering.
full_text_vectorizer = TfidfVectorizer(min_df=2, max_df=0.9, ngram_range=(1, 2))
full_text_features = full_text_vectorizer.fit_transform(training_data['full_text'])

# Step 3: Hashtag Features -- unigrams + bigrams with the built-in English stop list.
hashtag_vectorizer = TfidfVectorizer(
    min_df=2,
    max_df=0.9,
    ngram_range=(1, 2),
    stop_words='english',
)
hashtag_features = hashtag_vectorizer.fit_transform(training_data['hashtags'])

# Step 4: in_reply_to_screen_name Features -- the column is vectorized with
# unigram TF-IDF (not one-hot encoded).
reply_vectorizer = TfidfVectorizer(
    min_df=2,
    max_df=0.9,
    ngram_range=(1, 1),
    stop_words='english',
)
reply_features = reply_vectorizer.fit_transform(training_data['in_reply_to_screen_name'])

# Step 5: Categorical Features (country_user and gender_user)
# drop='first' omits the first category of each column from the encoding.
categorical_encoder = OneHotEncoder(drop='first')
categorical_columns = training_data[['country_user', 'gender_user']]
categorical_features = categorical_encoder.fit_transform(categorical_columns)

# Step 6: Combine All Features
# Column order of X is fixed by the order of this list; any matrix passed to a
# model trained on X has to be stacked in the same order.
feature_blocks = [
    text_clean_features,
    full_text_features,
    hashtag_features,
    sentiment,
    categorical_features,
    reply_features,
    topic_features,
]
X = hstack(feature_blocks)

# Target: the 'pol_spec_user' label column.
y = training_data['pol_spec_user']

# Shape report (the topic block is stacked into X but not printed individually).
print(f"Text Clean Features Shape: {text_clean_features.shape}")
print(f"Full Text Features Shape: {full_text_features.shape}")
print(f"Hashtag Features Shape: {hashtag_features.shape}")
print(f"Sentiment Features Shape: {sentiment.shape}")
print(f"Categorical Features Shape: {categorical_features.shape}")
print(f"Reply Features Shape: {reply_features.shape}")
print(f"Combined Features Shape: {X.shape}")

# Step 7: Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

Text Clean Features Shape: (405282, 674945)
Full Text Features Shape: (405282, 947138)
Hashtag Features Shape: (405282, 22574)
Sentiment Features Shape: (405282, 8)
Categorical Features Shape: (405282, 7)
Reply Features Shape: (405282, 9913)
Combined Features Shape: (405282, 1654595)


#### Logistic Regression

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# L2-penalised logistic regression on the liblinear solver with C=10.
# max_iter is raised to 1000 to give the solver room to converge.
model = LogisticRegression(
    C=10,
    penalty='l2',
    solver='liblinear',
    max_iter=1000,
    verbose=0,
).fit(X_train, y_train)

# Score the held-out split.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1.
report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report)

Accuracy: 0.7805

Classification Report:
              precision    recall  f1-score   support

      Center       0.79      0.71      0.75     20862
 Independent       0.83      0.52      0.64       141
        Left       0.78      0.84      0.80     34719
       Right       0.78      0.76      0.77     25335

    accuracy                           0.78     81057
   macro avg       0.79      0.71      0.74     81057
weighted avg       0.78      0.78      0.78     81057



#### Linear SVM (LinearSVC)

In [56]:
from sklearn.svm import LinearSVC

# Linear SVM with hinge loss, solved in the dual (dual=True).
# The dual solver shuffles the data with a pseudo-random generator, so a fixed
# random_state is needed for the reported accuracy to be reproducible.
# NOTE(review): the logged runs all end with "#iter = 1000", i.e. the solver
# stopped at max_iter in every one-vs-rest problem -- consider a higher
# max_iter if training time allows.
# This rebinds `model`, replacing the logistic regression fitted above.
model = LinearSVC(
    C=1,
    intercept_scaling=10,
    loss='hinge',
    verbose=1,
    max_iter=1000,
    dual=True,
    random_state=42,
)
model.fit(X_train, y_train)

# Predict on the test data
y_pred_svm = model.predict(X_test)

# Evaluate the model
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM Accuracy: {accuracy_svm:.4f}")

[LibLinear]....................................................................................................
optimization finished, #iter = 1000

Using -s 2 may be faster (also see FAQ)

Objective value = -49547.967386
nSV = 160569
....................................................................................................
optimization finished, #iter = 1000

Using -s 2 may be faster (also see FAQ)

Objective value = -292.549843
nSV = 1608
....................................................................................................
optimization finished, #iter = 1000

Using -s 2 may be faster (also see FAQ)

Objective value = -67209.825479
nSV = 193862
....................................................................................................
optimization finished, #iter = 1000

Using -s 2 may be faster (also see FAQ)

Objective value = -54234.579387
nSV = 170521
SVM Accuracy: 0.7899




In [18]:
# Load the test set from the Excel workbook 'test_data.xlsx' (relative path).
# Later cells read its 'full_text', 'hashtags', 'in_reply_to_screen_name',
# 'country_user', 'gender_user' and 'Id' columns.
test_data = pd.read_excel('test_data.xlsx')

In [19]:
from lemmatizer import lemmatize_tweet

# Create 'text_clean' for the test rows by passing each 'full_text' value through
# lemmatize_tweet, imported from the local `lemmatizer` module.
# NOTE(review): presumably the same cleaning that produced 'text_clean' in
# data_clean.csv -- confirm, since the fitted vectorizer depends on it.
test_data['text_clean'] = test_data['full_text'].apply(lemmatize_tweet)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/felix.hammond/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/felix.hammond/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/felix.hammond/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Write the test table (now including 'text_clean') to CSV without the index column.
# This cell has no execution count, and no visible cell reads the file back.
test_data.to_csv('test_data_clean.csv', index=False)

In [57]:
from textblob import TextBlob

# Build the test-set feature matrix using ONLY objects fitted on the training data
# (scaler, the four TF-IDF vectorizers, lda, categorical_encoder). Nothing is
# re-fitted here, and every intermediate gets a *_test name so the training
# variables are not overwritten.

# Fill missing text exactly as the training cell does ('missing' for the reply
# column, '' for the rest) so the fitted vectorizers see the same tokens.
test_data['full_text'] = test_data['full_text'].fillna('')
test_data['hashtags'] = test_data['hashtags'].fillna('')
test_data['in_reply_to_screen_name'] = test_data['in_reply_to_screen_name'].fillna('missing')
# Test rows cannot be dropped (each one needs a prediction), so an empty string
# stands in for any missing cleaned text.
test_data['text_clean'] = test_data['text_clean'].fillna('')

# Length features: characters and whitespace-separated words per column.
test_data['tweet_length_chars'] = test_data['full_text'].str.len()
test_data['tweet_length_words'] = test_data['full_text'].str.split().str.len()
test_data['hashtag_length_chars'] = test_data['hashtags'].str.len()
test_data['hashtag_length_words'] = test_data['hashtags'].str.split().str.len()
test_data['text_clean_length_chars'] = test_data['text_clean'].str.len()
test_data['text_clean_length_words'] = test_data['text_clean'].str.split().str.len()

# Sentiment scores, reusing get_sentiment defined in the training sentiment cell.
sentiment_scores_test = pd.DataFrame(
    test_data['text_clean'].map(get_sentiment).tolist(),
    index=test_data.index,
    columns=['sentiment_polarity', 'sentiment_subjectivity'],
)
test_data[['sentiment_polarity', 'sentiment_subjectivity']] = sentiment_scores_test

# Same eight columns, in the same order, as the training numeric block.
numeric_feature_columns_test = [
    'sentiment_polarity', 'sentiment_subjectivity',
    'tweet_length_chars', 'tweet_length_words',
    'hashtag_length_chars', 'hashtag_length_words',
    'text_clean_length_chars', 'text_clean_length_words',
]

# Apply the min/max learned from the training rows. Fitting a new scaler on the
# test rows would put the two sets on different scales.
sentiment_test = csr_matrix(
    scaler.transform(test_data[numeric_feature_columns_test].to_numpy())
)

# Text blocks: transform (never fit) with the training vectorizers.
text_features_test = text_clean_vectorizer.transform(test_data['text_clean'])
full_text_features_test = full_text_vectorizer.transform(test_data['full_text'])
hashtag_features_test = hashtag_vectorizer.transform(test_data['hashtags'])
reply_features_test = reply_vectorizer.transform(test_data['in_reply_to_screen_name'])

# Topic weights from the LDA fitted on the training 'text_clean' TF-IDF matrix.
topic_features_test = csr_matrix(lda.transform(text_features_test))

# One-hot encode with the training encoder. It is created without handle_unknown,
# so a category not seen in training raises here.
categorical_features_test = categorical_encoder.transform(test_data[['country_user', 'gender_user']])

# Stack in the same block order used to build X for training.
test_features = hstack([
    text_features_test,
    full_text_features_test,
    hashtag_features_test,
    sentiment_test,
    categorical_features_test,
    reply_features_test,
    topic_features_test,
])

# Fail early if the layout drifted from what the model was trained on.
assert test_features.shape[1] == X_train.shape[1], (
    f"test feature width {test_features.shape[1]} != training width {X_train.shape[1]}"
)

In [58]:
# Predict with whichever estimator is currently bound to `model`. Both the
# logistic-regression cell and the LinearSVC cell assign to `model`, so when the
# notebook is run top to bottom this is the LinearSVC.
test_predictions = model.predict(test_features)

In [59]:
# Two-column submission table: the test set's 'Id' values (written under the
# header 'ID') next to the predicted political-view label for each row.
submission_columns = {
    'ID': test_data['Id'],
    'pol_spec_user': test_predictions,
}
submission_df = pd.DataFrame(submission_columns)

# Save without the index column and confirm on stdout.
submission_df.to_csv('submission.csv', index=False)
print("Submission file created: 'submission.csv'")

Submission file created: 'submission.csv'
