In [1]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pickle

# Download the NLTK resources used later in the notebook:
#   - 'punkt' / 'punkt_tab': tokenizer models behind nltk.word_tokenize.
#     Newer NLTK releases look up 'punkt_tab' instead of 'punkt', so both are
#     requested to keep the notebook runnable across NLTK versions.
#   - 'stopwords': the English stop-word list used during preprocessing.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource, quiet=True)

# Load the dataset (expects 'train.csv' in the current working directory)
df = pd.read_csv('train.csv')

print("Dataset loaded successfully.")
display(df.head())

Dataset loaded successfully.


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [2]:
# Collapse the six per-category toxicity labels into one binary 'target'
# column by taking the row-wise maximum: 1 if any toxicity label is present,
# 0 otherwise. This turns the multi-label problem into binary classification.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
df['target'] = df[label_columns].max(axis=1)

# Keep an independent copy holding just the comment text and the new target;
# these are the only columns the remaining cells use.
df_filtered = df.loc[:, ['comment_text', 'target']].copy()

print("Target column created.")
display(df_filtered.head())

Target column created.


Unnamed: 0,comment_text,target
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


In [3]:
ps = PorterStemmer()

# Build the English stop-word set once. Calling stopwords.words('english')
# inside the per-token filter rebuilt the whole list for every single token,
# and list membership is a linear scan; a frozenset gives the same answers
# with a constant-time lookup.
STOP_WORDS = frozenset(stopwords.words('english'))


def transform_text(text):
    """Normalize a raw comment into a string of stemmed, filtered tokens.

    The text is lowercased and tokenized with ``nltk.word_tokenize``. Tokens
    that are not purely alphanumeric are dropped, English stop words are
    removed, and each remaining token is reduced with the Porter stemmer
    (``ps``).

    Note: a later cell in this notebook redefines ``transform_text`` with a
    SnowballStemmer; whichever definition ran last is the one in effect.

    Args:
        text: The raw comment as a ``str``.

    Returns:
        The surviving stemmed tokens joined by single spaces; an empty string
        when no token survives filtering.
    """
    tokens = nltk.word_tokenize(text.lower())
    # The isalnum() filter already rejects punctuation tokens, so no separate
    # string.punctuation check is needed.
    return " ".join(
        ps.stem(token)
        for token in tokens
        if token.isalnum() and token not in STOP_WORDS
    )

print("Text transformation function is ready.")

Text transformation function is ready.


In [6]:
# Replace your old Cell 3 with this one

from nltk.stem.snowball import SnowballStemmer
import string
import nltk

# Download necessary NLTK data if you haven't already.
# 'punkt_tab' is requested alongside 'punkt' because newer NLTK releases load
# it for nltk.word_tokenize; 'stopwords' provides the English stop-word list.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource, quiet=True)

# Initialize the SnowballStemmer for English
stemmer = SnowballStemmer('english')

# Build the English stop-word set once. Calling stopwords.words('english')
# inside the per-token filter rebuilt the whole list for every single token,
# and list membership is a linear scan; a frozenset gives the same answers
# with a constant-time lookup.
STOP_WORDS = frozenset(stopwords.words('english'))


def transform_text(text):
    """Normalize a raw comment into a string of stemmed, filtered tokens.

    The text is lowercased and tokenized with ``nltk.word_tokenize``. Tokens
    that are not purely alphanumeric are dropped, English stop words are
    removed, and each remaining token is reduced with the English
    SnowballStemmer (``stemmer``).

    Args:
        text: The raw comment as a ``str``.

    Returns:
        The surviving stemmed tokens joined by single spaces; an empty string
        when no token survives filtering.
    """
    tokens = nltk.word_tokenize(text.lower())
    # The isalnum() filter already rejects punctuation tokens, so no separate
    # string.punctuation check is needed.
    return " ".join(
        stemmer.stem(token)
        for token in tokens
        if token.isalnum() and token not in STOP_WORDS
    )

print("Text transformation function is ready (upgraded to SnowballStemmer).")

Text transformation function is ready (upgraded to SnowballStemmer).


In [8]:
# Run every raw comment through transform_text and keep the cleaned result in
# a new 'transformed_text' column next to the original text.
raw_comments = df_filtered['comment_text']
df_filtered['transformed_text'] = raw_comments.apply(transform_text)

print("Text preprocessing complete. This was the longest step!")
display(df_filtered.head())

Text preprocessing complete. This was the longest step!


Unnamed: 0,comment_text,target,transformed_text
0,Explanation\nWhy the edits made under my usern...,0,explan edit made usernam hardcor metallica fan...
1,D'aww! He matches this background colour I'm s...,0,match background colour seem stuck thank talk ...
2,"Hey man, I'm really not trying to edit war. It...",0,hey man realli tri edit war guy constant remov...
3,"""\nMore\nI can't make any real suggestions on ...",0,ca make real suggest improv wonder section sta...
4,"You, sir, are my hero. Any chance you remember...",0,sir hero chanc rememb page


In [9]:
# Features are the preprocessed comments; labels are the binary target.
X, y = df_filtered['transformed_text'], df_filtered['target']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the TF-IDF vocabulary (capped at 5000 features) on the training split
# only, then encode the test split with that same fitted vectorizer.
tfidf_toxic = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_toxic.fit_transform(X_train)
X_test_tfidf = tfidf_toxic.transform(X_test)

# Train the Logistic Regression classifier on the TF-IDF training matrix.
model_toxic = LogisticRegression(solver='liblinear', random_state=42)
model_toxic.fit(X_train_tfidf, y_train)

# (Optional) Score the held-out split.
# NOTE(review): if toxic comments are a small share of the data, accuracy
# alone can look high even for a weak model -- confirm the class balance and
# consider reporting precision/recall as well.
y_pred = model_toxic.predict(X_test_tfidf)
print(f"Toxic Comment Model Accuracy: {accuracy_score(y_test, y_pred):.4f}")

Toxic Comment Model Accuracy: 0.9585


In [10]:
# Save the fitted TF-IDF vectorizer. The `with` block guarantees the file is
# flushed and closed even if pickling raises, instead of leaving an open
# handle behind as a bare open(...) call would.
with open('toxic_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_toxic, vectorizer_file)

# Save the trained Logistic Regression model the same way.
with open('toxic_model.pkl', 'wb') as model_file:
    pickle.dump(model_toxic, model_file)

print("\nSuccessfully saved 'toxic_vectorizer.pkl' and 'toxic_model.pkl'!")


Successfully saved 'toxic_vectorizer.pkl' and 'toxic_model.pkl'!
