# Clean the aug_emotion_data

## Import Libraries

In [5]:
import pandas as pd
import nltk ,re, string

from nltk.tokenize import word_tokenize

from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# word_tokenize, pos_tag and WordNetLemmatizer each rely on their own NLTK data
# package. Downloading only 'stopwords' leaves a fresh environment failing with
# LookupError in the later cells. Newer NLTK releases look for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead of the older package names, so both
# spellings are requested; an id unknown to the index is reported, not raised.
for _resource in (
    'stopwords',
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
):
    nltk.download(_resource)

# English stop words plus a few extra corpus-specific filler words.
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Honor\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Read the dataFrame

In [6]:
# Load aug_emotion_data.csv; the first CSV column is used as the DataFrame index.
df = pd.read_csv("aug_emotion_data.csv", index_col= 0)

In [7]:
df.head()

Unnamed: 0,clean_content,sentiment_id
0,RT Your anxiety might have represented a crush...,0
1,So not pumped for this interview nervous,0
2,so my boss told me today that I am on vacation...,4
3,could feel humiliated bn detachment realized n...,2
4,Even in the 1930s the anti Semitism of literar...,0


# Cleaning the Data

### Delete duplicate data

In [None]:
df.shape

In [None]:
# Remove fully duplicated rows, then display the resulting (rows, columns) shape.
df = df.drop_duplicates()
df.shape

In [7]:
# Optional: keep only the first 10,000 rows for a quick trial run (uncomment to enable)
# df = df.head(10000)
# df.shape

### Text Preprocessing

- Lowercasing: Convert all text to lowercase to ensure case insensitivity.
- Tokenization: Split the text into individual words or tokens.
- Removing Punctuation: Remove any punctuation marks from the text.
- Stop Word Removal: Remove common words (e.g., "the," "is," "and") that do not contribute much to the topic modeling process.
- Lemmatization or Stemming: Reduce words to their base or root form to consolidate variations of the same word.

In [9]:
# Tokenization
def tokenize_text(text):
    """Split *text* into word tokens with NLTK's ``word_tokenize``."""
    return word_tokenize(text)

# Apply tokenize_text() to every row of the 'clean_content' column
token_content = df['clean_content'].apply(tokenize_text)

In [29]:
#remove stop word + any special characters + lemmatize the tokens
# Substitution patterns applied, in this order, to every token. They are raw
# strings so that `\s`, `\$` and `\(` reach the regex engine without relying on
# Python's handling of unknown string escapes.
_STRIP_PATTERNS = (
    # URLs
    re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
               r'(?:%[0-9a-fA-F][0-9a-fA-F]))+'),
    # @mentions
    re.compile(r"(@[A-Za-z0-9_]+)"),
    # special characters and mathematical symbols
    re.compile(r'[^a-zA-Z0-9\s]|(\$.+?\$)'),
    # words made only of digits
    re.compile(r'\b\d+\b'),
    # words that start with a digit
    re.compile(r'\b\d\w*\b'),
)


def _wordnet_pos(tag):
    """Map a tag returned by ``pos_tag`` to the POS letter passed to the lemmatizer."""
    if tag.startswith("NN"):
        return 'n'
    if tag.startswith('VB'):
        return 'v'
    # Every tag that is neither a noun nor a verb is lemmatized as an adjective.
    return 'a'


def remove_noise(tokens, stop_words=()):
    """Clean, lemmatize and lowercase a list of tokens.

    Each token has URLs, @mentions, non-alphanumeric characters, pure numbers
    and digit-led words stripped out. What is left is lowercased, lemmatized
    with the part of speech derived from ``pos_tag``, and kept only if it is
    non-empty, not punctuation and not a stop word.

    Args:
        tokens: Sequence of string tokens belonging to one text.
        stop_words: Iterable of lowercase words to discard.

    Returns:
        A list of cleaned, lowercase, lemmatized tokens in their original order.
    """
    # Build the lookup set and the lemmatizer once per call, not once per token.
    stop_set = frozenset(stop_words)
    lemmatizer = WordNetLemmatizer()
    cleaned_tokens = []

    # Tagging is done on the original tokens so the tagger still sees casing.
    for token, tag in pos_tag(tokens):
        for pattern in _STRIP_PATTERNS:
            token = pattern.sub('', token)

        # Nothing left after stripping: skip the lemmatizer entirely.
        if not token:
            continue

        # Lowercase before lemmatizing: WordNet lookups are case-sensitive, so
        # a capitalized token such as "Dogs" would otherwise come back unchanged.
        token = lemmatizer.lemmatize(token.lower(), _wordnet_pos(tag))

        # Keep the token unless it is empty, punctuation or a stop word.
        if token and token not in string.punctuation and token not in stop_set:
            cleaned_tokens.append(token)
    return cleaned_tokens

In [13]:
# Apply remove_noise() to each row's token list, filtering with the stop_words list
clean_content = token_content.apply(lambda x: remove_noise(x, stop_words))

In [14]:
# Join each row's cleaned tokens back into one space-separated string,
# overwriting the original 'clean_content' column.
df['clean_content'] = clean_content.apply(lambda x: ' '.join(x))

In [28]:
df.isnull().sum()

clean_content    0
sentiment_id     0
dtype: int64

## Save Pandas DataFrames

In [29]:
# Save the dataframe to a CSV file (the index is not written).
# NOTE(review): rows whose text was cleaned down to an empty string appear to be
# read back as NaN. The reload below reports 153 missing clean_content values
# although isnull() found none before saving. Confirm, and consider filtering
# empty rows before writing.
df.to_csv('aug_emotion_clean_data_v2.csv', sep=',', index=False)

In [30]:
dff = pd.read_csv("aug_emotion_clean_data_v2.csv")

In [31]:
dff.head()

Unnamed: 0,clean_content,sentiment_id
0,rt anxiety might represent crushing faith char...,0
1,pump interview nervous,0
2,bos tell today vacation next week notion anoth...,4
3,could feel humiliated bn detachment realize ne...,2
4,even anti semitism literary figure like hilair...,0


In [32]:
dff.shape

(522333, 2)

In [33]:
missing_counts = dff.isnull().sum()
missing_counts

clean_content    153
sentiment_id       0
dtype: int64

In [31]:
# Drop every row that contains a NaN value (per the counts above, only
# clean_content has any after the reload).
dff = dff.dropna()

In [32]:
dff.isnull().sum()

clean_content    0
sentiment_id     0
dtype: int64