In [36]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# Make sure the NLTK stop-word list is available locally.
nltk.download('stopwords')

# Read the training and test tweets from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Store the character length of every tweet in a 'len' column on both frames.
for frame in (train, test):
    frame['len'] = frame['tweet'].str.len()

def clean_tweet(text, stemmer, stop_words):
    """Normalise one raw tweet into a space-joined string of stemmed tokens.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, tokens found in
    ``stop_words`` are dropped, and each remaining token is stemmed.

    Parameters
    ----------
    text : str
        The raw tweet text.
    stemmer : object
        Anything exposing ``stem(word) -> str`` (a ``PorterStemmer`` here).
    stop_words : container of str
        Lower-case tokens to discard; a ``set`` gives O(1) membership tests.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces (may be empty).
    """
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join(stemmer.stem(token) for token in tokens if token not in stop_words)


# Build the stemmer and the stop-word set ONCE. Re-creating the set for every
# token (as a comprehension condition would) reloads the word list each time
# and dominates the runtime of this cell.
ps = PorterStemmer()
english_stop_words = set(stopwords.words('english'))

# Iterate over the column itself rather than a hard-coded row count, so the
# corpora always have exactly one entry per row of the loaded CSV files.
train_corpus = [clean_tweet(tweet, ps, english_stop_words) for tweet in train['tweet']]
test_corpus = [clean_tweet(tweet, ps, english_stop_words) for tweet in test['tweet']]

# Learn a bag-of-words vocabulary (capped at 2500 terms) from the training
# corpus only, and encode the training tweets with it.
cv = CountVectorizer(max_features=2500)
x = cv.fit_transform(train_corpus).toarray()
y = train.iloc[:, 1].astype(int)  # Assuming label column is at index 1 and converting to integers

print(x.shape)
print(y.shape)

# Encode the test tweets with the SAME fitted vectorizer. Fitting a second
# vectorizer on the test corpus would produce a different vocabulary, so the
# columns of x_test would not line up with the features the model is trained
# on; it would also replace `cv`, breaking any later cv.transform(...) call
# that is meant to feed the trained model.
x_test = cv.transform(test_corpus).toarray()

print(x_test.shape)

# Hold out 25% of the labelled data for validation; the seed fixes the split.
x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.25, random_state=42)

print(x_train.shape)
print(x_valid.shape)
print(y_train.shape)
print(y_valid.shape)

# Train the decision tree model.
# random_state is pinned because the tree permutes candidate features at each
# split; without a seed, ties can be broken differently between runs and the
# report below would not be reproducible.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(x_train, y_train)

# Predict on the validation set
valid_predictions = dt_model.predict(x_valid)

# Print per-class precision / recall / F1 on the held-out validation split
print(classification_report(y_valid, valid_predictions))

# Example sentence to test
sentence = "I love this !"

# Build the stemmer and stop-word set once instead of once per token.
ps = PorterStemmer()
english_stop_words = set(stopwords.words('english'))

# Preprocess the sentence with the same steps used for the corpus:
# keep letters only, lower-case, tokenise, drop stop words, stem.
tokens = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()
processed_sentence = ' '.join(
    ps.stem(word) for word in tokens if word not in english_stop_words
)

# Encode the sentence with the fitted CountVectorizer. For the prediction to
# be meaningful, `cv` has to be the vectorizer whose vocabulary the model's
# training features were built with.
sentence_vector = cv.transform([processed_sentence]).toarray()

# Predict the class of the sentence (an array holding one label)
prediction = dt_model.predict(sentence_vector)

# Print the prediction; index the single element explicitly rather than
# relying on the truth value of an array comparison.
if prediction[0] == 1:
    print("The sentence is classified as hate speech.")
else:
    print("The sentence is not classified as hate speech.")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


(31962, 2500)
(31962,)
(17197, 2500)
(23971, 2500)
(7991, 2500)
(23971,)
(7991,)
              precision    recall  f1-score   support

           0       0.97      0.96      0.96      7432
           1       0.52      0.58      0.55       559

    accuracy                           0.93      7991
   macro avg       0.74      0.77      0.76      7991
weighted avg       0.94      0.93      0.93      7991

The sentence is not classified as hate speech.


In [None]:
# Predict a label for every test tweet with the decision tree. The previous
# code assigned `prediction` (the single-element result for the example
# sentence) to the whole column, and then read from `test_data`, a name that
# does not exist yet at this point in the notebook.
# NOTE: the labels are only meaningful if x_test was encoded with the same
# vocabulary the tree was trained on.
test['label'] = dt_model.predict(x_test)

# Keep only the columns needed for the submission file and write it out.
output = test[['id', 'label']]
output.to_csv('output.csv', index=False)

# Using SVM

In [3]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Read the tweets used for training and validation.
data = pd.read_csv('train.csv')

# Text-normalisation tools shared by the cells below: a Porter stemmer and
# the English stop-word set (built once, as a set, for fast lookups).
stemmer = PorterStemmer()
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


def _normalise(raw_tweet):
    """Keep letters only, lower-case, drop stop words, stem, and re-join."""
    words = re.sub('[^a-zA-Z]', ' ', raw_tweet).lower().split()
    return ' '.join([stemmer.stem(w) for w in words if w not in stop_words])


# One cleaned string per row of `data`, in row order.
corpus = [_normalise(data['tweet'][row]) for row in range(len(data))]

from sklearn.model_selection import train_test_split

# TF-IDF features over a vocabulary capped at 2500 terms, as a dense array.
vectorizer = TfidfVectorizer(max_features=2500)
tfidf_matrix = vectorizer.fit_transform(corpus)
X = tfidf_matrix.toarray()
y = data['label']

# 80/20 train/validation split with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a support-vector classifier with scikit-learn's default settings.
svm_model = SVC().fit(X_train, y_train)

# Score the held-out split and show per-class precision / recall / F1.
valid_predictions = svm_model.predict(X_valid)
report = classification_report(y_valid, valid_predictions)
print(report)

# Sentence to run through the trained SVM.
sentence = "you are black!"

# Apply the same cleaning steps that were applied to the training tweets.
letters_only = re.sub('[^a-zA-Z]', ' ', sentence)
stemmed_sentence = [
    stemmer.stem(word)
    for word in letters_only.lower().split()
    if word not in stop_words
]
processed_sentence = ' '.join(stemmed_sentence)

# Encode with the already-fitted TF-IDF vectorizer and classify.
sentence_vector = vectorizer.transform([processed_sentence]).toarray()
prediction = svm_model.predict(sentence_vector)

# Report the verdict; label 1 is treated as hate speech.
is_hate_speech = prediction[0] == 1
print(
    "The sentence is classified as hate speech."
    if is_hate_speech
    else "The sentence is not classified as hate speech."
)



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


              precision    recall  f1-score   support

           0       0.96      1.00      0.98      5937
           1       0.93      0.45      0.60       456

    accuracy                           0.96      6393
   macro avg       0.94      0.72      0.79      6393
weighted avg       0.96      0.96      0.95      6393

The sentence is classified as hate speech.


In [4]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Reload the training tweets from disk.
data = pd.read_csv('train.csv')

# Recreate the stemmer and the English stop-word set used for cleaning.
stemmer = PorterStemmer()
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Clean every tweet: letters only -> lower-case -> tokens -> drop stop words
# -> stem -> re-join with single spaces.
tweets = data['tweet']
corpus = []
for position in range(len(data)):
    lowered = re.sub('[^a-zA-Z]', ' ', tweets[position]).lower()
    stems = [stemmer.stem(token) for token in lowered.split() if token not in stop_words]
    corpus.append(' '.join(stems))

from sklearn.model_selection import train_test_split

# Vectorise the cleaned tweets with TF-IDF (vocabulary limited to 2500 terms).
vectorizer = TfidfVectorizer(max_features=2500)
X = vectorizer.fit_transform(corpus).toarray()
y = data['label']

# Reserve 20% of the rows for validation; the seed keeps the split stable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = split_parts

# Train an SVC with default hyper-parameters on the training portion.
svm_model = SVC()
svm_model.fit(X_train, y_train)

# Evaluate on the validation portion.
valid_predictions = svm_model.predict(X_valid)
print(classification_report(y_valid, valid_predictions))

# Second example sentence for the SVM.
sentence = "you are beautiful!"

# Clean the sentence exactly like the training tweets were cleaned.
sentence_tokens = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()
stemmed_sentence = []
for token in sentence_tokens:
    if token in stop_words:
        continue
    stemmed_sentence.append(stemmer.stem(token))
processed_sentence = ' '.join(stemmed_sentence)

# Turn the cleaned text into a TF-IDF row using the fitted vectorizer.
sentence_vector = vectorizer.transform([processed_sentence]).toarray()

# Ask the trained classifier for a label.
prediction = svm_model.predict(sentence_vector)

# Map the predicted label to a human-readable message.
if prediction[0] != 1:
    print("The sentence is not classified as hate speech.")
else:
    print("The sentence is classified as hate speech.")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


              precision    recall  f1-score   support

           0       0.96      1.00      0.98      5937
           1       0.93      0.45      0.60       456

    accuracy                           0.96      6393
   macro avg       0.94      0.72      0.79      6393
weighted avg       0.96      0.96      0.95      6393

The sentence is not classified as hate speech.


In [9]:
# Load the test tweets.
test_data = pd.read_csv('test.csv')


def _clean_test_tweet(raw_tweet):
    """Letters only, lower-case, stop words removed, stemmed, space-joined.

    Relies on `stemmer` and `stop_words` created by the SVM training cell.
    """
    words = re.sub('[^a-zA-Z]', ' ', raw_tweet).lower().split()
    return ' '.join(stemmer.stem(w) for w in words if w not in stop_words)


# One cleaned string per test row, in row order.
test_corpus = [_clean_test_tweet(test_data['tweet'][idx]) for idx in range(len(test_data))]

In [10]:
# Encode the cleaned test tweets with the vectorizer fitted on the training
# corpus (transform only; the vocabulary is not re-learned).
test_tfidf = vectorizer.transform(test_corpus)
test_X = test_tfidf.toarray()

# Label every test tweet with the trained SVM and attach the result.
test_predictions = svm_model.predict(test_X)
test_data['label'] = test_predictions

# Write the id/label pairs to disk without the DataFrame index.
submission_columns = ['id', 'label']
output = test_data[submission_columns]
output.to_csv('output.csv', index=False)

In [None]:
# Re-encode the cleaned test tweets using the fitted TF-IDF vectorizer.
test_X = vectorizer.transform(test_corpus).toarray()

# Predict with the trained SVM and store the labels on the test frame.
test_predictions = svm_model.predict(test_X)
test_data['label'] = test_predictions

# Same export as the id/label file, but keeping the tweet text alongside
# each prediction so the results can be inspected by eye.
inspection_columns = ['id', 'tweet', 'label']
output1 = test_data[inspection_columns]
output1.to_csv('output1.csv', index=False)