In [1]:
import sys
sys.path.append('../scripts')
from helpers import *
from preprocessing import *

In [2]:
# Directory that holds the raw Twitter CSV files.
base_path = '../data/raw_data/'

# Read the three splits in one pass through the project's read_file helper;
# the file order below fixes which frame each name is bound to.
df_train, df_test, df_validation = (
    read_file(base_path + csv_name)
    for csv_name in (
        'twitter_training.csv',
        'twitter_test.csv',
        'twitter_validation.csv',
    )
)

# Cell output: the shape of each split.
df_train.shape, df_test.shape, df_validation.shape

((74682, 4), (500, 4), (500, 4))

## Drop NaN values (before preprocessing)

In [3]:
# Drop rows with a missing 'Tweet content' value from every split, not only
# the training one: preprocess_tweet is applied to all three frames further
# down, and a missing value would otherwise be passed to it as-is.
# dropna returns a new frame, so re-running this cell is harmless.
df_train = df_train.dropna(subset=['Tweet content'])
df_test = df_test.dropna(subset=['Tweet content'])
df_validation = df_validation.dropna(subset=['Tweet content'])

# Cell output: shape of the training split after the drop.
df_train.shape

(73996, 4)

## Preprocessing tweet content
- Handle Mentions (@mentions)
    - User mentions often don't contribute to sentiment analysis and can be removed or replaced. This step ensures that mentions (e.g., "@user123") don't affect the analysis.
- Handle Hashtags (#hashtags)
    - Extract information from hashtags or remove them. Hashtags can be valuable in sentiment analysis as they often reflect the main theme or topic of a tweet.
- Remove URLs
    - URLs don't usually convey sentiment and can be removed to focus on the text content.
- Handle Emoticons and Emoji
    -  Emoticons and emojis add emotional context to text. This step involves converting them to a standardized format.
- Remove Retweet Tags (RT)
    - "RT" tags indicate a retweet, which might not contribute to sentiment analysis. Removing these tags ensures that the model focuses on the original content.
- Handle Special Characters
    - Special characters may not carry sentiment-related information and can be converted or removed to maintain text clarity.
- Handle Contractions and Slang
    - Expanding contractions and replacing slang ensures consistency in the language used, helping the model better understand and analyze sentiment.
- Tokenization with Twitter-Specific Tokenizers
    - Tokenization involves breaking down the tweet into individual words or subwords. Twitter-specific tokenizers consider the unique characteristics of Twitter content, such as the use of hashtags and mentions.
- Filtering Short Words
    - Very short tokens typically carry little sentiment information. Filtering them out reduces noise in the text that is passed on to the later steps.
- Removing Stopwords
    - Stopwords are common words (e.g., "and," "the," "is") that do not carry significant meaning. Removing them reduces the dimensionality of the data and can improve the efficiency of sentiment analysis.
- Stemming
    - Stemming involves reducing words to their root or base form. This helps in treating similar words with different inflections as the same, reducing the complexity of the data.
- Lemmatizing
    - Similar to stemming, lemmatizing reduces words to a valid base form, considering the context. It helps in maintaining the integrity of words in the English language.
- Handling Numeric Data
    - Numeric data may not be directly related to sentiment.

In [4]:
# Run the project's preprocess_tweet helper over the raw tweet text of each
# split and store the result in a new column on that same frame.
for _split in (df_train, df_test, df_validation):
    _split['Preprocessed Tweet content'] = _split['Tweet content'].apply(preprocess_tweet)

In [7]:
# Output directory for the processed splits.
path = '../data/processed_data/'

# Write each split to its own CSV through the project's save_to_csv helper,
# in the same order as before: training, test, validation.
for _split, _csv_name in (
    (df_train, 'preprocessed_training_tweets.csv'),
    (df_test, 'preprocessed_test_tweets.csv'),
    (df_validation, 'preprocessed_validation_tweets.csv'),
):
    save_to_csv(_split, path + _csv_name)