## Dataset download

In [18]:
import requests
import zipfile
import os

In [19]:
# Public Kaggle download endpoint for the "2024 U.S. Election Sentiment on X" dataset.
url = 'https://www.kaggle.com/api/v1/datasets/download/emirhanai/2024-u-s-election-sentiment-on-x'

# Without a timeout, requests waits forever on a stalled connection and the
# kernel appears hung; 30 s applies to both the connect and the read phase.
response = requests.get(url, timeout=30)

In [20]:
# Persist the downloaded archive only when the server answered 200;
# otherwise report the HTTP status that came back.
if response.status_code != 200:
    print(f"Error downloading dataset: {response.status_code}")
else:
    with open('dataset.zip', 'wb') as archive_file:
        archive_file.write(response.content)
    print("Dataset saved as 'dataset.zip'")

Dataset saved as 'dataset.zip'


In [21]:
# Unpack every member of the downloaded archive into the 'dataset' folder.
with zipfile.ZipFile('dataset.zip') as archive:  # default mode is 'r'
    archive.extractall(path='dataset')
print("Dataset unpacked as 'dataset'")

Dataset unpacked as 'dataset'


In [22]:
# os.listdir returns entries in arbitrary order; sort them so the listing
# (and anything that iterates over `files`) is reproducible across machines.
files = sorted(os.listdir('dataset'))
print("Dataset files:", files)

Dataset files: ['test.csv', 'train.csv', 'val.csv']


## Exploratory Data Analysis

In [95]:
import pandas as pd
import plotly.express as px

In [96]:
# Load the three splits shipped with the dataset (train / test / validation).
df_train, df_test, df_val = (
    pd.read_csv(f'dataset/{split}.csv') for split in ('train', 'test', 'val')
)

# Report how many rows each split contains.
for label, frame in (('df_train', df_train), ('df_test', df_test), ('df_val', df_val)):
    print(f"{label} size: {frame.shape[0]}")

df_train size: 500
df_test size: 50
df_val size: 50


In [97]:
# Preview the first rows of the training split to inspect its columns.
df_train.head()

Unnamed: 0,tweet_id,user_handle,timestamp,tweet_text,candidate,party,retweets,likes,sentiment
0,1,@user123,2024-11-03 08:45:00,Excited to see Kamala Harris leading the Democ...,Kamala Harris,Democratic Party,120,450,positive
1,2,@politicsFan,2024-11-03 09:15:23,Donald Trump's policies are the best for our e...,Donald Trump,Republican Party,85,300,positive
2,3,@greenAdvocate,2024-11-03 10:05:45,Jill Stein's environmental plans are exactly w...,Jill Stein,Green Party,60,200,positive
3,4,@indieVoice,2024-11-03 11:20:10,Robert Kennedy offers a fresh perspective outs...,Robert Kennedy,Independent,40,150,neutral
4,5,@libertyLover,2024-11-03 12:35:55,Chase Oliver's libertarian stance promotes tru...,Chase Oliver,Libertarian Party,30,120,positive


In [99]:
# Sum likes per party before plotting: passing the raw rows makes plotly stack
# one bar segment per tweet, which hides the totals behind hundreds of slices.
likes_by_party = df_train.groupby('party', as_index=False)['likes'].sum()

fig = px.bar(
    likes_by_party,
    x='party',
    y='likes',
    title='Total likes per party (training set)',
    labels={'party': 'Party', 'likes': 'Total likes'},
)
# Rendering inside the notebook requires nbformat>=4.2.0 in the kernel's
# environment; without it fig.show() raises a ValueError.
fig.show()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

## Text Preprocessing

In [81]:
import nltk
import string
import re
import hunspell
# Make sure that python-dev and libhunspell-dev are installed.
#   $ sudo apt-get update
#   $ sudo apt-get install python-dev 
#   $ sudo apt-get install libhunspell-dev
#   $ sudo pip install hunspell

# NLTK resources needed by the preprocessing cell below
# (tokenizer, stopword list and WordNet lemmatizer).
for resource in ('punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Shared Hunspell instance; the dictionary (.dic) and affix (.aff) files are
# expected in the local 'dictionaries' folder.
spell_checker = hunspell.HunSpell('dictionaries/index.dic', 'dictionaries/index.aff')

[nltk_data] Downloading package punkt_tab to /home/jose/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jose/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jose/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [91]:
def preprocess(input):
    """Turn raw text into a list of cleaned, lemmatized tokens.

    The pipeline is: whitespace normalisation, URL removal, tokenization,
    lowercasing, punctuation filtering, English stopword removal, Hunspell
    spelling correction and WordNet lemmatization.

    Relies on the module-level ``spell_checker`` (a ``hunspell.HunSpell``
    instance) and on the NLTK resources downloaded in the setup cell.

    Args:
        input: The text to process. ``None`` and float NaN (missing values
            in a pandas column) are treated as empty text.

    Returns:
        list[str]: The processed tokens, in their original order.
    """
    # Missing values would otherwise crash on .split(); NaN is the only
    # float that is not equal to itself.
    if input is None or (isinstance(input, float) and input != input):
        return []

    # Whitespace removal: split() drops leading/trailing whitespace and
    # collapses repeated whitespace in one pass.
    text = " ".join(input.split())

    # URL removal
    pattern = r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"
    text = re.sub(pattern, "", text)

    # Tokenization (separating words into a list of tokens) + lowercasing
    lowercased_tokens = [token.lower() for token in nltk.word_tokenize(text)]

    # Filtering punctuation. Test every character against a set instead of
    # doing a substring search in string.punctuation, so multi-character
    # tokens such as '...', '--' or the tokenizer's quote tokens are removed too.
    punctuation_chars = set(string.punctuation)
    filtered_tokens = [
        token for token in lowercased_tokens
        if not all(char in punctuation_chars for char in token)
    ]

    # Stopword removal (removing words with little value such as 'the' 'of' etc.)
    # A set gives constant-time membership checks; tokens are already lowercase.
    stopword_set = set(nltk.corpus.stopwords.words("english"))
    filtered_tokens = [token for token in filtered_tokens if token not in stopword_set]

    # Spelling correction (using the hunspell library)
    # NOTE(review): tokens are already lowercased here, so proper nouns
    # (e.g. candidate names) may be flagged as misspelled and replaced by the
    # first suggestion - verify on real tweets.
    corrected_tokens = []
    for token in filtered_tokens:
        if spell_checker.spell(token):
            corrected_tokens.append(token)
            continue
        suggestions = spell_checker.suggest(token)
        # Take the first suggestion; keep the original token when there is none.
        corrected_tokens.append(suggestions[0] if suggestions else token)

    # Lemmatization (reducing words to their lemma form)
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in corrected_tokens]

# Demo input: contains deliberate misspellings, parentheses and a URL so that
# every preprocessing stage has something to act on.
example = (
    "Naturalf languagde processiang is a field of artificial inteligence "
    "that deals with the interaction between computers and human (natural) language. "
    "Check out this article for more information: "
    "https://en.wikipedia.org/wiki/Natural_language_processing"
)
print(preprocess(example))

['natural', 'language', 'processing', 'field', 'artificial', 'intelligence', 'deal', 'interaction', 'computer', 'human', 'natural', 'language', 'check', 'article', 'information']
