In [199]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.calibration import LabelEncoder

In [200]:
labeled_df = pd.read_csv('../data/labeled.csv')
labeled_df.head()

Unnamed: 0,id,text,cleaned_text_with_stop_words_removal,label
0,1807053896587919743,We should consider dismantling provincial stru...,consid dismantl provinci structur empow local ...,neutral
1,1807031275989114981,Those joining in insurance online will be requ...,join insur onlin requir provid nation ident ca...,positive
2,1807030978554269711,Minister Aryal also requested the provincial a...,minist aryal also request provinci local gover...,positive
3,1807029744837300466,\xf0\x9f\x8c\xbe Happy National Paddy Day (Aas...,xf0x9fx8cxbe happi nation paddi day aasar 15 x...,positive
4,1806998724926755034,Climate action plans and targets set by the Ne...,climat action plan target set nepal govern gui...,positive


In [201]:
labeled_df.isnull().sum()
null_rows = labeled_df[labeled_df['cleaned_text_with_stop_words_removal'].isnull()]
print("Rows with null values in 'cleaned_text_with_stop_words_removal':")
print(null_rows)
labeled_df.dropna(subset=['cleaned_text_with_stop_words_removal'], inplace=True)
duplicates_text = labeled_df.duplicated(subset=['cleaned_text_with_stop_words_removal'])
labeled_df[duplicates_text]

Rows with null values in 'cleaned_text_with_stop_words_removal':
                       id                                               text  \
7913  1149606322318401536  @SudeepS1402 @rsansar @katesictibet @LokAawaaz...   

     cleaned_text_with_stop_words_removal    label  
7913                                  NaN  neutral  


Unnamed: 0,id,text,cleaned_text_with_stop_words_removal,label


In [202]:
def trainSvc(X_train, X_test, y_train, y_test):
    # Train SVM model
    svm_model = SVC(C=10, class_weight='balanced')
    svm_model.fit(X_train, y_train)

    # Evaluate SVM model
    svm_predictions = svm_model.predict(X_test)
    print("SVM Model Performance:")
    print(classification_report(y_test, svm_predictions))

def trainNaive(X_train, X_test, y_train, y_test):
    # Train Naive Bayes model
    nb_model = MultinomialNB()
    nb_model.fit(X_train, y_train)

    # Evaluate Naive Bayes model
    nb_predictions = nb_model.predict(X_test)
    print("Naive Bayes Model Performance:")
    print(classification_report(y_test, nb_predictions))

In [203]:
# Initialize TF-IDF Vectorizer


def Vectorize(text):

    tfidf = TfidfVectorizer(max_features=5000,ngram_range=(1,1))

    # Fit and transform the labeled data
    return tfidf.fit_transform(text)
     

In [204]:
# countpositive = 0
# countnegative=0
# countneutral=0
# for data in labeled_df['label']:
#     # print(data)
#     if data == 2:
#         countpositive = countpositive +1
#     elif data == 0:
#         countnegative = countnegative +1
#     else:
#         countneutral = countneutral +1


# print(countpositive,countnegative,countneutral)

In [205]:
X_labeled = Vectorize(labeled_df['cleaned_text_with_stop_words_removal'])
label_encoder = LabelEncoder()
y_labeled = label_encoder.fit_transform(labeled_df['label'])


# Split the labeled data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_labeled, y_labeled, test_size=0.2, stratify= y_labeled,random_state=42)
# print("X_train:", X_train)
# print("X_test:", X_test)
# print("y_train:", y_train)
# print("y_test:", y_test)


trainSvc(X_train, X_test, y_train, y_test)
trainNaive(X_train, X_test, y_train, y_test)


SVM Model Performance:
              precision    recall  f1-score   support

           0       0.75      0.76      0.75       653
           1       0.69      0.54      0.60       324
           2       0.74      0.80      0.77       711

    accuracy                           0.74      1688
   macro avg       0.73      0.70      0.71      1688
weighted avg       0.73      0.74      0.73      1688

Naive Bayes Model Performance:
              precision    recall  f1-score   support

           0       0.71      0.72      0.72       653
           1       0.87      0.10      0.19       324
           2       0.62      0.85      0.72       711

    accuracy                           0.66      1688
   macro avg       0.73      0.56      0.54      1688
weighted avg       0.70      0.66      0.61      1688

