### Model Testing

In [4]:
import joblib
import pickle

def _load_pickle(path):
    """Deserialize and return the object stored in the pickle file at ``path``.

    SECURITY: ``pickle.load`` can execute arbitrary code while loading, so
    only point this at artifact files that come from a trusted source.
    """
    with open(path, 'rb') as artifact_file:
        return pickle.load(artifact_file)


# Fitted CountVectorizer
fitted_vectorizer = _load_pickle('src/fitted_vectorizer_combined.pkl')

# Logistic Regression model
logistic_model = _load_pickle('src/logistic_model.pkl')

# Logistic Regression model with tuned hyperparameters
# (presumably a fitted grid search, judging by the file name -- confirm)
fitted_logistic_model = _load_pickle('src/fitted_gridsearch.pkl')

# Fitted TF-IDF vectorizer
tfidf_fitted_vectorizer = _load_pickle('src/fitted_tfidf_vectorizer_combined.pkl')

# Logistic Regression model trained on TF-IDF features
tfidf_logistic_model = _load_pickle('src/logistic_model_tfidf.pkl')

# Random Forest model
rf_model = _load_pickle('src/random_forest_fitted_gridsearch.pkl')

# SVM model
svm_model = _load_pickle('src/svm_fitted_gridsearch.pkl')

# Custom stopwords
# NOTE: a later cell in this notebook assigns to `stopwords` again, so the
# value loaded here is replaced once that cell has run.
stopwords = _load_pickle('src/custom_stopwords.pkl')

# vect,lr = joblib.load("combined.pkl")

In [5]:
## Vectorizers
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
import nltk

# Download the NLTK data packages, one after another, in a fixed order
for nltk_package in ('wordnet', 'punkt', 'words', 'stopwords'):
    nltk.download(nltk_package)

# Other nltk modules
from nltk.corpus import stopwords
import nltk as nlp
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import words

import pickle

# Build the customized stopword list used by the tweet-cleaning step.
#
# NOTE: this rebinds the name `stopwords`, replacing whatever it held before
# (including the object loaded from src/custom_stopwords.pkl).
#
# The corpus reader is imported under its own alias so these statements can be
# re-run even after `stopwords` has been rebound to a plain list (calling
# `.words(...)` on that list would raise AttributeError).
from nltk.corpus import stopwords as nltk_stopwords

# Tokens added on top of the NLTK English list
_EXTRA_STOPWORDS = ["rt", "u", "r", "amp", "w", "th"]

# Words deliberately kept in the text, i.e. dropped from the stopword list
_KEPT_WORDS = {'not', 'is', 'against', "don't", "have", "won't"}

# Filtering (rather than list.remove) does not raise ValueError when one of the
# kept words is absent from the installed corpus version.
stopwords = [word for word in nltk_stopwords.words('english') if word not in _KEPT_WORDS]
stopwords.extend(_EXTRA_STOPWORDS)

len(stopwords)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\erick\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\erick\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\erick\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\erick\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


179

In [7]:
import re

def clean_tweets(tweet):
    """Normalize a raw tweet into a lowercase, lemmatized, stopword-free string.

    Steps, in order: expand "n't" to " not"; drop links, @handles and
    #hashtags; replace punctuation with spaces; lowercase; drop words found
    in the module-level ``stopwords`` collection; tokenize with NLTK and
    lemmatize every token with the WordNet lemmatizer.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (may be empty).

    Notes
    -----
    * Links are stripped *before* punctuation. Previously the punctuation
      pass ran first and broke "https://t.co/abc" into "https t co abc", so
      the link pattern only removed the leading "https" token.
    * Handles are matched with ``\\w`` so underscores are included; the old
      pattern left the part after the first underscore in the text.
    * The former final step that trimmed trailing dots was removed: every "."
      has already been replaced by the punctuation pass, so it never matched.
    * NOTE(review): if the fitted vectorizers/models were trained on text
      cleaned by the previous version of this function, URL fragments and
      handle remnants may exist in their vocabulary -- confirm or re-fit.
    """
    lemmatizer = WordNetLemmatizer()

    # Expand contracted negations ("don't" -> "do not")
    tweet = tweet.replace("n't", " not")

    # Remove links while their punctuation is still intact
    tweet = re.sub(r'http\S+', '', tweet, flags=re.IGNORECASE)

    # Remove @handles, including ones that contain underscores
    tweet = re.sub(r'@\w+', '', tweet)

    # Remove hashtags together with the word attached to them
    tweet = re.sub(r'#\w+', '', tweet)

    # Replace runs of punctuation with a single space.
    # Raw string: "\|" is an invalid escape in a normal string literal.
    # A lone "#" and double quotes are not in this class and are left as-is.
    tweet = re.sub(r"[!@$%^&*()_+\|/?,.:;'`’-]+", " ", tweet)

    # Lowercase all characters
    tweet = tweet.lower()

    # Drop stopwords; split()/join also collapses any extra whitespace
    tweet = ' '.join(word for word in tweet.split() if word not in stopwords)

    # Tokenize and lemmatize
    tokens = nltk.word_tokenize(tweet)
    return " ".join(lemmatizer.lemmatize(token) for token in tokens)

In [18]:
# Text Blob
!pip install textblob
from textblob import TextBlob
from textblob import Word

# Sentiment classification function
def classify_sentiment(polarity):
    """Map a polarity score to a sentiment label.

    Returns 'Positive' when ``polarity`` is strictly greater than 0.15,
    'Negative' when it is strictly below 0, and 'Neutral' for everything
    else (which includes both boundary values 0 and 0.15).
    """
    if polarity > 0.15:
        label = 'Positive'
    else:
        label = 'Negative' if polarity < 0 else 'Neutral'
    return label




[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip





In [19]:
import pandas as pd
import numpy as np

In [101]:
# Tweet analysis function

def tweet_analysis(tweet, vectorizer=fitted_vectorizer, model=logistic_model,
                   democrat_threshold=0.511, republican_threshold=0.491):
    """Classify a tweet's party affiliation and sentiment; return a text report.

    Parameters
    ----------
    tweet : str
        Raw tweet text.
    vectorizer : fitted vectorizer, optional
        Must provide ``transform`` and ``get_feature_names_out``. Defaults to
        ``fitted_vectorizer`` (bound when this function is defined).
    model : fitted classifier, optional
        Must provide ``predict_proba``. Defaults to ``logistic_model`` (bound
        when this function is defined).
    democrat_threshold : float, optional
        Probabilities strictly above this value are labelled 'Democrat'.
    republican_threshold : float, optional
        Probabilities strictly below this value are labelled 'Republican'.
        Anything between the two thresholds (inclusive) is 'Neutral'.

    Returns
    -------
    str
        Multi-line report with the original tweet, the cleaned tweet, the
        party affiliation, the sentiment label and the raw probability.
    """
    # Clean the tweet
    cleaned_tweet = clean_tweets(tweet)

    # Vectorize the cleaned tweet (single-document batch)
    vectorized_tweet = vectorizer.transform([cleaned_tweet])

    # Dense DataFrame whose columns are the vectorizer's feature names
    transformed_tweet = pd.DataFrame(
        vectorized_tweet.toarray(),
        columns=vectorizer.get_feature_names_out(),
    )

    # Column 1 of predict_proba for the single row.
    # NOTE(review): treated as the Democrat probability, which assumes the
    # model's second class is Democrat -- confirm against model.classes_.
    prediction_prob = model.predict_proba(transformed_tweet)[0, 1]

    # Determine party affiliation from the two thresholds
    if prediction_prob > democrat_threshold:
        party_affiliation = 'Democrat'
    elif prediction_prob < republican_threshold:
        party_affiliation = 'Republican'
    else:
        party_affiliation = 'Neutral'

    # Sentiment analysis on the cleaned text
    sentiment_polarity = TextBlob(cleaned_tweet).sentiment.polarity
    sentiment_label = classify_sentiment(sentiment_polarity)

    return (
        f"tweet: {tweet}\n"
        f"Cleaned Tweet: {cleaned_tweet}\n"
        f"Party Affiliation: {party_affiliation}\n"
        f"Tweet Sentiment: {sentiment_label}\n"
        f"Predict proba: {prediction_prob}"
    )


In [103]:
# Example run

# Any of the loaded vectorizers and models listed below can be passed in; when
# omitted, tweet_analysis falls back to fitted_vectorizer and logistic_model.
# vectorizer_list = [fitted_vectorizer, tfidf_fitted_vectorizer]
# model_list = [logistic_model, fitted_logistic_model, tfidf_logistic_model, rf_model, svm_model]


sample_tweet = "I support border protection"
results = tweet_analysis(sample_tweet, vectorizer=fitted_vectorizer, model=logistic_model)
print(results)

tweet: I support border protection
Cleaned Tweet: support border protection
Party Affiliation: Neutral
Tweet Sentiment: Neutral
Predict proba: 0.49756072097819337


