<a href="https://colab.research.google.com/github/dqminhv/fellowship_ai-nlp-challenge/blob/main/notebook/nlp_challenge_data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Required Packages & Mount Google Drive

In [1]:
#Ignore warnings in Google Colab
import warnings
# Ignore all warnings (not recommended in general)
warnings.filterwarnings("ignore")

In [2]:
# Attach Google Drive to the Colab filesystem so the IMDB review file can be read.
from google.colab import drive

drive_mount_point = '/content/gdrive'
drive.mount(drive_mount_point)

Mounted at /content/gdrive


In [3]:
#Import required packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Data Preprocessing & Feature Engineering

In [9]:
# Read the cleaned IMDB review CSV from the mounted Drive into a DataFrame.
file_path = '/content/gdrive/MyDrive/Document/Data Science/Skills Development/fellowshipai/imdb-cleaned.csv'
imdb_cleaned = pd.read_csv(filepath_or_buffer=file_path)

In [11]:
# Replace every whitespace-separated token of each review with its Porter stem
# and store the re-joined text in a new 'stemmed_review' column.
stemmer = PorterStemmer()


def _stem_review(text):
    """Return `text` with each whitespace-delimited token replaced by its stem."""
    return ' '.join(stemmer.stem(token) for token in text.split())


imdb_cleaned['stemmed_review'] = imdb_cleaned['review'].apply(_stem_review)

In [12]:
# Fit a TF-IDF model on the stemmed reviews and encode them in one pass.
# Settings: unigrams and bigrams, min_df=2, max_df=0.8, and a vocabulary
# capped at 5000 features.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    max_df=.8,
    max_features=5000,
)
stemmed_corpus = imdb_cleaned['stemmed_review']
X_review = tfidf_vectorizer.fit_transform(stemmed_corpus)

In [13]:
# Expand the TF-IDF matrix into a dense DataFrame whose columns are named
# after the vectorizer's features.
# NOTE: .toarray() materialises the whole matrix in memory.
feature_names = tfidf_vectorizer.get_feature_names_out()
X = pd.DataFrame(X_review.toarray(), columns=feature_names)

In [14]:
# Encode the sentiment labels as integers: 'positive' -> 1, 'negative' -> 0.
# Series.map turns every value that is missing from the mapping into NaN, so
# running this cell a second time on the already-encoded column would wipe out
# all labels. The dtype guard skips the mapping once the column is numeric,
# which makes the cell safe to re-run.
sentiment_codes = {'positive': 1, 'negative': 0}
if not pd.api.types.is_numeric_dtype(imdb_cleaned['sentiment']):
    imdb_cleaned['sentiment'] = imdb_cleaned['sentiment'].map(sentiment_codes)

In [15]:
# Target vector: the sentiment column of the review DataFrame.
target_column = 'sentiment'
y = imdb_cleaned[target_column]

# Exporting Cleaned Data for Modeling

In [18]:
# Save the TF-IDF feature matrix to Google Drive for the modeling step.
# The destination must be X_file_path: writing to file_path would overwrite
# the source imdb-cleaned.csv with the feature matrix and never create X.csv.
X_file_path = '/content/gdrive/MyDrive/Document/Data Science/Skills Development/fellowshipai/X.csv'
X.to_csv(X_file_path, index=False)

In [19]:
# Save the target labels to Google Drive for the modeling step.
# The destination must be y_file_path: writing to file_path would overwrite
# the source imdb-cleaned.csv with the labels and never create y.csv.
y_file_path = '/content/gdrive/MyDrive/Document/Data Science/Skills Development/fellowshipai/y.csv'
y.to_csv(y_file_path, index=False)