In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string
import json
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec


In [2]:
def preprocess_text(text):
    tokens = word_tokenize(text)

    tokens = [token.lower() for token in tokens]

    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    tokens = [token for token in tokens if token not in string.punctuation]

    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return tokens

In [3]:
with open('../data/labelled_data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)


In [4]:
for item in data:
    item['text_content'] = preprocess_text(item['text_content'])

In [5]:
with open('../data/updated_json_file.json', 'w') as file:
    json.dump(data, file, indent=2)

In [6]:
with open('../data/updated_json_file.json', 'r') as file:
    data = json.load(file)

In [7]:
df = pd.DataFrame(data)

In [8]:
df

Unnamed: 0,text_content,sentiment,sentiment_score
0,"[peopl, go, keep, buy, crap, alway, key, word,...",negative,-0.4588
1,"[better, way, convinc, american, support, caus...",positive,0.2028
2,"[context, vivek, immedi, jump, terrorist, atta...",negative,-0.8316
3,"[know, ’, 😂, wonder, go, win, south, carolina,...",positive,0.5859
4,"[tucker, carlson, interview, santiago, abasc, ...",negative,-0.2500
...,...,...,...
405,"[messag, china, clear, era, appeas, halt, flow...",positive,0.9815
406,"[alex, sheppard, perhap, alreadi, know, sent, ...",negative,-0.2732
407,"[ike, amaz, manag, get, fire, anoth, step, lef...",positive,0.0516
408,"[😑, anyon, surpris]",positive,0.2263


In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Convert lists of tokens to space-separated strings
df['text_content_str'] = df['text_content'].apply(lambda x: ' '.join(x))
df


Unnamed: 0,text_content,sentiment,sentiment_score,text_content_str
0,"[peopl, go, keep, buy, crap, alway, key, word,...",negative,-0.4588,peopl go keep buy crap alway key word “ far ri...
1,"[better, way, convinc, american, support, caus...",positive,0.2028,better way convinc american support caus distu...
2,"[context, vivek, immedi, jump, terrorist, atta...",negative,-0.8316,context vivek immedi jump terrorist attack chr...
3,"[know, ’, 😂, wonder, go, win, south, carolina,...",positive,0.5859,know ’ 😂 wonder go win south carolina primari 😂😂
4,"[tucker, carlson, interview, santiago, abasc, ...",negative,-0.2500,tucker carlson interview santiago abasc vox pa...
...,...,...,...,...
405,"[messag, china, clear, era, appeas, halt, flow...",positive,0.9815,messag china clear era appeas halt flow americ...
406,"[alex, sheppard, perhap, alreadi, know, sent, ...",negative,-0.2732,alex sheppard perhap alreadi know sent jail 19...
407,"[ike, amaz, manag, get, fire, anoth, step, lef...",positive,0.0516,ike amaz manag get fire anoth step left ’ plan...
408,"[😑, anyon, surpris]",positive,0.2263,😑 anyon surpris


In [10]:
# Bag-of-Words (BoW) representation
bow_vectorizer = CountVectorizer()
bow_features = bow_vectorizer.fit_transform(df['text_content_str'])
print("Bag-of-Words features shape:", bow_features.shape)

Bag-of-Words features shape: (410, 866)


In [11]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_features = tfidf_vectorizer.fit_transform(df['text_content_str'])
print("TF-IDF features shape:", tfidf_features.shape)


TF-IDF features shape: (410, 866)


In [12]:
tokenized_text = df['text_content_str'].apply(word_tokenize)
word2vec_model = Word2Vec(sentences=tokenized_text, vector_size=100, window=5, min_count=1, workers=4)
word2vec_features = word2vec_model.wv.vectors
print("Word2Vec features shape:", word2vec_features.shape)


Word2Vec features shape: (875, 100)


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder


In [14]:
X = df['text_content_str']
y = df['sentiment']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [16]:
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [17]:
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [51]:
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Mapping:")
for label, encoded_label in label_mapping.items():
    print(f"{label} --> {encoded_label}")

Label Mapping:
negative --> 0
neutral --> 1
positive --> 2


In [18]:
models = [SVC(kernel='linear', random_state=42),XGBClassifier(),LogisticRegression(), RandomForestClassifier(),MultinomialNB()]
accuracy=[]
class_reports =[]
for model in models:
    model.fit(X_train_tfidf, y_train_encoded)

    y_pred = model.predict(X_test_tfidf)

    acc = accuracy_score(y_test_encoded, y_pred)
    class_rep = classification_report(y_test_encoded, y_pred)
    accuracy.append(acc)
    class_reports.append(class_rep)

    print(f'{model.__class__.__name__} Accuracy: {acc:.2f}')
    print(f'{model.__class__.__name__} Classification Report:\n {class_rep}\n \n')


SVC Accuracy: 1.00
SVC Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        27
           1       1.00      1.00      1.00        20
           2       1.00      1.00      1.00        35

    accuracy                           1.00        82
   macro avg       1.00      1.00      1.00        82
weighted avg       1.00      1.00      1.00        82

 

XGBClassifier Accuracy: 1.00
XGBClassifier Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        27
           1       1.00      1.00      1.00        20
           2       1.00      1.00      1.00        35

    accuracy                           1.00        82
   macro avg       1.00      1.00      1.00        82
weighted avg       1.00      1.00      1.00        82

 

LogisticRegression Accuracy: 1.00
LogisticRegression Classification Report:
               precision    recall  f1-sco

In [33]:
from sklearn.model_selection import GridSearchCV
# Define the parameter grid for SVC
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['linear']}

# Create SVC model
svc_model = SVC(random_state=42)

# Apply GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(svc_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_tfidf, y_train_encoded)

# Get the best model from the grid search
best_svc_model = grid_search.best_estimator_

In [34]:
best_svc_model

In [35]:
best_svc_model = grid_search.best_estimator_


In [36]:
y_pred = best_svc_model.predict(X_test_tfidf)

# Evaluate the model
acc = accuracy_score(y_test_encoded, y_pred)
class_rep = classification_report(y_test_encoded, y_pred)

In [52]:
print(class_rep)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        27
           1       1.00      1.00      1.00        20
           2       1.00      1.00      1.00        35

    accuracy                           1.00        82
   macro avg       1.00      1.00      1.00        82
weighted avg       1.00      1.00      1.00        82



In [20]:
import joblib

joblib.dump(svc_model, '../data/svc_model.joblib')


['../data/svc_model.joblib']

In [39]:
preprocessed_new_data = preprocess_text("If he was to reproduce at all it would be asexually in a puddle of toxic bile, regurgitating some pure fucking homunculus of everything that is wrong with the world.")

In [40]:
new_data_tfidf = tfidf_vectorizer.transform(preprocessed_new_data)


In [41]:
predictions = best_svc_model.predict(new_data_tfidf)


In [42]:
predictions

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [43]:
sentiment_mapping = {0: 'negative', 1: 'neutral', 2: 'positive'}


In [44]:
predicted_labels = [sentiment_mapping[prediction] for prediction in predictions]


In [45]:
predicted_labels

['neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral']

In [46]:
from collections import Counter

In [47]:
label_counts = Counter(predicted_labels)


In [48]:
total_count = len(predicted_labels)
average_sentiment = sum([count * index for index, count in enumerate(label_counts.values())]) / total_count


In [49]:
print("Label Counts:", label_counts)
print("Average Sentiment:", average_sentiment)

Label Counts: Counter({'neutral': 13})
Average Sentiment: 0.0
