In [31]:
import json
import csv
import pandas as pd
import re
import nltk
from nltk.stem import WordNetLemmatizer

## TODO: Change filename
# Location of the raw classifier CSV; the literal below is a placeholder path.
data = '/filename/classifierdata_raw.csv'
# Read the raw CSV into the working DataFrame used by the cells below.
df = pd.read_csv(filepath_or_buffer=data)

In [32]:
import warnings
warnings.filterwarnings('ignore')

# Extracting our Relevant Columns

Recall that in `Scraping.ipynb`, our dataframe had several columns from the original dataset (`category`, `link`, etc.) that are no longer relevant to us, now that we are in the modeling stage. The only two columns we'll need at this point are the `class` (0 or 1) and `text` columns.

In [33]:
# Keep only the label and the headline text. Take an explicit copy so that the
# column assignments made in later cells modify an independent frame instead
# of a slice of `df` (the pattern that triggers pandas' SettingWithCopyWarning).
classifier_df = df[['class', 'text']].copy()

In [34]:
# Display the two-column frame; the rendered output is abbreviated to the
# leading and trailing rows.
classifier_df

Unnamed: 0,class,text
0,0,Over 4 Million Americans Roll Up Sleeves For O...
1,0,"American Airlines Flyer Charged, Banned For Li..."
2,0,23 Of The Funniest Tweets About Cats And Dogs ...
3,0,The Funniest Tweets From Parents This Week (Se...
4,0,Woman Who Called Cops On Black Bird-Watcher Lo...
...,...,...
11831,1,Mike Pence Accused Of Hypocrisy Over Tribute T...
11832,0,6 Fired At Howard University For Misconduct In...
11833,0,Imagine Dragons Singer Makes Plea For LGBTQ Eq...
11834,1,White House Hires Former Disney Channel Star A...


# Preprocessing

This is where the actual preprocessing happens: we lowercase the text, strip out punctuation, and trim and collapse runs of whitespace into single spaces.

In [35]:
# Normalise the headline text with vectorised `.str` methods:
#   1. lowercase everything,
#   2. delete every character that is neither a word character nor whitespace,
#   3. trim both ends and squeeze each run of whitespace down to one space.
# `.str` methods pass missing values through untouched, whereas calling
# `re.sub` through `apply` raises TypeError on a missing (NaN) entry.
# `assign` returns a new frame, so the frame selected from `df` is never
# written to in place.
classifier_df = classifier_df.assign(
    text=(
        classifier_df['text']
        .str.lower()
        .str.replace(r'[^\w\s]', '', regex=True)
        .str.strip()
        .str.replace(r'\s+', ' ', regex=True)
    )
)

In [36]:
# Spot-check the first rows after the text has been normalised.
classifier_df.head()

Unnamed: 0,class,text
0,0,over 4 million americans roll up sleeves for o...
1,0,american airlines flyer charged banned for lif...
2,0,23 of the funniest tweets about cats and dogs ...
3,0,the funniest tweets from parents this week sep...
4,0,woman who called cops on black birdwatcher los...


# Lemmatization

In [37]:
# Make sure the NLTK data this section relies on is installed, so the notebook
# runs from a fresh kernel on a machine that has never fetched it. The recorded
# output of this cell shows a WordNet download, but no download call was left
# in the code.
#   - 'wordnet' backs WordNetLemmatizer
#   - 'punkt' / 'punkt_tab' back nltk.word_tokenize (older NLTK releases look
#     for 'punkt', newer ones for 'punkt_tab')
# nltk.download reports failures without raising by default, so an offline
# run still proceeds when the data is already present.
for resource in ('wordnet', 'punkt', 'punkt_tab'):
    nltk.download(resource, quiet=True)

# Initialize WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to lemmatize text
def lemmatize_text(text):
    """Return `text` with every token replaced by its WordNet lemma.

    The string is split with `nltk.word_tokenize`, each token is handed to the
    module-level `lemmatizer` (no part-of-speech tag is passed, so the
    lemmatizer's default applies), and the lemmas are joined back together
    with single spaces.
    """
    return ' '.join(lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text))

# Lemmatize every entry of 'text' and store the result in a separate
# 'lemmatized_text' column, leaving the cleaned text alongside for comparison.
classifier_df['lemmatized_text'] = classifier_df['text'].map(lemmatize_text)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dianakazarian/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [39]:
# Compare the cleaned `text` with the new `lemmatized_text` column on the first rows.
classifier_df.head()

Unnamed: 0,class,text,lemmatized_text
0,0,over 4 million americans roll up sleeves for o...,over 4 million american roll up sleeve for omi...
1,0,american airlines flyer charged banned for lif...,american airline flyer charged banned for life...
2,0,23 of the funniest tweets about cats and dogs ...,23 of the funniest tweet about cat and dog thi...
3,0,the funniest tweets from parents this week sep...,the funniest tweet from parent this week sept ...
4,0,woman who called cops on black birdwatcher los...,woman who called cop on black birdwatcher lose...


In [42]:
# Keep only the label and the lemmatized headline, exposing the latter under
# the column name 'text'.
classifier_df = (
    classifier_df
    .loc[:, ['class', 'lemmatized_text']]
    .rename(columns={'lemmatized_text': 'text'})
)

In [44]:
# Confirm the frame is back to two columns: `class` and the lemmatized `text`.
classifier_df.head()

Unnamed: 0,class,text
0,0,over 4 million american roll up sleeve for omi...
1,0,american airline flyer charged banned for life...
2,0,23 of the funniest tweet about cat and dog thi...
3,0,the funniest tweet from parent this week sept ...
4,0,woman who called cop on black birdwatcher lose...


In [45]:
## TODO: Change filename
# Write the processed frame to CSV, leaving the row index out of the file.
classifier_df.to_csv(path_or_buf='/filename/classifierdata_processed.csv', index=False)