In [1]:
import spacy
import gensim
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

In [2]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Load dataset
df = pd.read_csv("train.csv")  

# Drop rows with missing tweets
df = df.dropna(subset=["tweet"])

# Function for batch processing with spaCy pipe
def preprocess_texts(texts):
    processed_texts = []
    for doc in nlp.pipe(texts, batch_size=100, disable=["ner", "parser"]):  # Disable unnecessary components
        tokens = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
        processed_texts.append(tokens)
    return processed_texts

# Apply optimized preprocessing
df["tokens"] = preprocess_texts(df["tweet"].astype(str))  



In [3]:
# Train Word2Vec Model
word2vec_model = Word2Vec(sentences=df["tokens"], vector_size=100, window=5, min_count=2, sg=1)
word2vec_model.save("word2vec.model")  # Save model


In [4]:
def get_sentence_vector(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(100)  # Return zero vector if empty

df["vector"] = df["tokens"].apply(lambda x: get_sentence_vector(x, word2vec_model))


In [5]:
# Convert class column into one-hot encoding
df["hate_speech"] = (df["class"] == 0).astype(int)
df["offensive"] = (df["class"] == 1).astype(int)
df["neither"] = (df["class"] == 2).astype(int)

# Convert to NumPy arrays
X = np.vstack(df["vector"].values)  
y = df[["hate_speech", "offensive", "neither"]]  

# Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
# Train Logistic Regression Model
model = OneVsRestClassifier(LogisticRegression())
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)


In [7]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.45      0.05      0.09       290
           1       0.91      0.94      0.93      3832
           2       0.77      0.71      0.74       835

   micro avg       0.88      0.85      0.87      4957
   macro avg       0.71      0.57      0.59      4957
weighted avg       0.86      0.85      0.85      4957
 samples avg       0.85      0.85      0.85      4957



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
from imblearn.over_sampling import RandomOverSampler

# Ensure X and y are extracted correctly from your DataFrame
X = df["tweet"]  # Extract tweets
y = df["class"]  # Extract class labels

ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X.values.reshape(-1, 1), y)

# Convert X_resampled back into a list of tweets
X_resampled = X_resampled.flatten()


In [22]:
import gensim.downloader as api
word2vec_model = api.load("word2vec-google-news-300")  # 300-dimension pre-trained Word2Vec


In [23]:
import numpy as np

def tweet_to_vector(tweet, word2vec_model):
    words = tweet.split()  # Tokenize tweet
    word_vectors = [word2vec_model[word] for word in words if word in word2vec_model]

    # Average all word vectors in the tweet
    return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(300)  # 300-D Word2Vec vectors


In [24]:
X_resampled_vectors = np.array([tweet_to_vector(tweet, word2vec_model) for tweet in X_resampled])

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_resampled_vectors, y_resampled, test_size=0.2, random_state=42)


In [13]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)


In [14]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.69      0.64      0.67      3849
           1       0.74      0.71      0.72      3794
           2       0.75      0.83      0.79      3871

    accuracy                           0.73     11514
   macro avg       0.73      0.73      0.73     11514
weighted avg       0.73      0.73      0.73     11514



In [25]:
import joblib

# Save the trained model
joblib.dump(model, "classifier_model.pkl")
print("Model saved successfully!")

Model saved successfully!
