## Assignment 3

### Perform text cleaning, lemmatization (any method), stop-word removal (any method), and label encoding. Create representations using TF-IDF. Save the outputs.

In [1]:
import pandas as pd
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Fetch the NLTK resources this notebook relies on: the stop-word lists and
# the WordNet data used by the lemmatizer.
for resource in ("stopwords", "wordnet"):
    nltk.download(resource)
# Kept as the cell's final expression so its return value is still displayed.
nltk.download("omw-1.4")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shubh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shubh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\shubh\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
# Toy sentiment corpus: every sentence is paired with its class label.
samples = [
    ("I love machine learning", "Positive"),
    ("Machine learning is very interesting", "Positive"),
    ("I hate boring lectures", "Negative"),
    ("This course is useless", "Negative"),
]

# Column-oriented view of the same pairs, in the layout pandas expects.
data = {
    "text": [sentence for sentence, _ in samples],
    "label": [sentiment for _, sentiment in samples],
}

df = pd.DataFrame(data)
print(df)

                                   text     label
0               I love machine learning  Positive
1  Machine learning is very interesting  Positive
2                I hate boring lectures  Negative
3                This course is useless  Negative


In [4]:
def clean_text(text):
    """Normalize raw text to lowercase letters separated by single spaces.

    Characters other than ``a``-``z`` (after lowercasing) act as word
    separators instead of being deleted, so words joined only by punctuation
    or digits ("good,bad", "state-of-the-art") remain separate tokens rather
    than being fused into one. Apostrophes are the exception: they are
    dropped, so a contraction such as "don't" stays a single token ("dont").

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The cleaned text, with no leading or trailing whitespace and exactly
        one space between tokens. Empty if the input contains no letters.
    """
    text = text.lower()
    # Drop apostrophes first so contractions are not split into fragments.
    text = re.sub(r"['’]", '', text)
    # Any run of non-letters (punctuation, digits, whitespace) becomes a single
    # space; this both separates tokens and collapses repeated whitespace.
    text = re.sub(r'[^a-z]+', ' ', text)
    return text.strip()

# Clean every raw sentence and keep the result alongside the original text.
df["clean_text"] = df["text"].map(clean_text)
print(df["clean_text"])

0                 i love machine learning
1    machine learning is very interesting
2                  i hate boring lectures
3                  this course is useless
Name: clean_text, dtype: object


In [5]:
# A set gives fast membership checks when filtering tokens below.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    """Remove English stop words from ``text`` and lemmatize what remains.

    The text is split on whitespace; tokens found in ``stop_words`` are
    discarded and every other token is passed through
    ``lemmatizer.lemmatize``. The surviving tokens are returned joined by
    single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))
    return " ".join(kept_tokens)

# Run stop-word removal and lemmatization on the cleaned sentences.
df["processed_text"] = df["clean_text"].map(preprocess_text)
print(df["processed_text"])

0           love machine learning
1    machine learning interesting
2             hate boring lecture
3                  course useless
Name: processed_text, dtype: object


In [6]:
# Map each class name to an integer code and store the codes next to the labels.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df["label"])
df["label_encoded"] = encoded_labels

print(df[["label", "label_encoded"]])

      label  label_encoded
0  Positive              1
1  Positive              1
2  Negative              0
3  Negative              0


In [7]:
# Fit the vectorizer on the processed sentences and encode each of them as a
# row of TF-IDF scores.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df["processed_text"])

# Expand the matrix into a dense table with one column per vocabulary term.
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(data=tfidf_matrix.toarray(), columns=feature_names)

print(tfidf_df)

    boring    course     hate  interesting  learning  lecture      love  \
0  0.00000  0.000000  0.00000     0.000000  0.526405  0.00000  0.667679   
1  0.00000  0.000000  0.00000     0.667679  0.526405  0.00000  0.000000   
2  0.57735  0.000000  0.57735     0.000000  0.000000  0.57735  0.000000   
3  0.00000  0.707107  0.00000     0.000000  0.000000  0.00000  0.000000   

    machine   useless  
0  0.526405  0.000000  
1  0.526405  0.000000  
2  0.000000  0.000000  
3  0.000000  0.707107  


In [8]:
# Persist the raw, cleaned, processed and label-encoded columns (row index omitted).
df.to_csv(path_or_buf="cleaned_preprocessed_data.csv", index=False)

In [9]:
# Persist the dense TF-IDF feature table (row index omitted).
tfidf_df.to_csv(path_or_buf="tfidf_features.csv", index=False)

In [10]:
# Record which integer code stands for which class name. Codes are taken from
# each class's position in the fitted encoder's ``classes_`` array.
class_names = label_encoder.classes_
label_mapping = pd.DataFrame(
    {"label": class_names, "encoded_value": range(len(class_names))}
)

label_mapping.to_csv(path_or_buf="label_mapping.csv", index=False)