## Import Libraries

In [1]:
# Load the required libraries
import spacy
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

import nltk
# Fetch the NLTK data used by the POS tagging cell: nltk.word_tokenize needs
# the 'punkt' tokenizer models and nltk.pos_tag needs the perceptron tagger.
# NOTE(review): recent NLTK releases appear to name these resources
# 'punkt_tab' / 'averaged_perceptron_tagger_eng' - confirm against the
# installed NLTK version.
for resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)

C:\Users\abdul\anaconda3\envs\tf2x\lib\site-packages\numpy\.libs\libopenblas.GK7GX5KEQ4F6UYO3P26ULGBQYHGQO7J4.gfortran-win_amd64.dll
C:\Users\abdul\anaconda3\envs\tf2x\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll
[nltk_data] Error loading averaged_perceptron_tagger: <urlopen error
[nltk_data]     [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify
[nltk_data]     failed: unable to get local issuer certificate
[nltk_data]     (_ssl.c:1108)>


False

## Read Dataset

In [2]:
# Read the train and test CSV files into DataFrames
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('dataset/Train.csv', 'dataset/Test.csv')
)

In [3]:
# Keep only the first 5000 training rows, then pull out the text column
# (model input) and the label column (target) for each split.
train = train.head(5000)
X_train = train['text']
y_train = train['label']
X_test = test['text']
y_test = test['label']

## TF-IDF Vectorizer

In [4]:
# Build TF-IDF features: the vocabulary (at most 1000 terms, English stop
# words removed) is learned from the training text only and then reused to
# transform the test text.
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X_train_tfidf, X_test_tfidf = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

## SVM Model
### Training and Testing

In [5]:
# Fit a linear-kernel SVM on the TF-IDF training matrix
model = SVC(kernel='linear').fit(X_train_tfidf, y_train)

# Predict labels for the test matrix (scored in the evaluation cell)
y_pred = model.predict(X_test_tfidf)

### Evaluation

In [6]:
# Score the predictions against the true test labels; precision, recall and
# F1 all use average='weighted'.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.839
Precision: 0.8390035593622779
Recall: 0.839
F1 Score: 0.8390001352400486


## POS Tagging

In [7]:
# POS tagging helper built on NLTK
def pos_tag(text):
    """Return the part-of-speech tags of ``text``, one per token.

    The text is split with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``; only the tags are kept, in token order.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    tags = []
    for _word, tag in tagged_tokens:
        tags.append(tag)
    return tags

# Tag every document in both splits: one list of POS tags per document
X_train_pos = list(map(pos_tag, X_train))
X_test_pos = list(map(pos_tag, X_test))

In [8]:
# Append each document's POS tags to its text so a single vectorizer sees both.
# Texts and tag lists are paired by position with zip(): indexing the Series
# as X_train[i] is a label lookup, which only lines up with the position-based
# X_train_pos[i] while the Series index is exactly 0..n-1.
# NOTE(review): the vectorizer applied to this output uses
# stop_words='english'; tags such as IN/TO may be dropped as stop words once
# lowercased - confirm that is intended.
X_train_combined = [
    text + " " + " ".join(tags)
    for text, tags in zip(X_train, X_train_pos)
]

X_test_combined = [
    text + " " + " ".join(tags)
    for text, tags in zip(X_test, X_test_pos)
]

## TF-IDF Vectorizer

In [9]:
# Re-fit a fresh TF-IDF vectorizer (same settings as before) on the combined
# text + POS-tag strings; the test strings reuse the training vocabulary.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
)
X_train_tfidf = vectorizer.fit_transform(X_train_combined)
X_test_tfidf = vectorizer.transform(X_test_combined)

## SVM Model
### Training and Testing

In [10]:
# Linear-kernel SVM trained on the text + POS TF-IDF features
model = SVC(kernel='linear')
model = model.fit(X_train_tfidf, y_train)

# Predicted labels for the test set, scored in the next cell
y_pred = model.predict(X_test_tfidf)

### Evaluation

In [11]:
# Metrics for the text + POS model; precision, recall and F1 are computed
# with average='weighted'.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    score_fn(y_test, y_pred, average='weighted')
    for score_fn in (precision_score, recall_score, f1_score)
)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.837
Precision: 0.8372938208725734
Recall: 0.837
F1 Score: 0.8369584229842425


## Morphology Analyzer

In [12]:
# Load the 'en_core_web_sm' English spaCy pipeline; the resulting `nlp`
# object is what morphology() calls to analyse each review.
nlp = spacy.load('en_core_web_sm')

# Convert reviews into "lemma_POS" token strings using the spaCy pipeline
def morphology(text):
    """Turn each review into a space-separated string of ``lemma_POS`` tokens.

    Parameters
    ----------
    text : iterable of str
        The reviews to process, e.g. a pandas Series of raw text.

    Returns
    -------
    list of str
        One string per review, in input order. Tokens flagged as stop words
        (``token.is_stop``) are dropped; every remaining token contributes
        ``token.lemma_ + "_" + token.pos_``.

    Notes
    -----
    Uses the module-level ``nlp`` pipeline.
    """
    # nlp.pipe streams the reviews through the pipeline in batches instead of
    # one nlp() call per review, and each doc is filtered in a single pass.
    return [
        " ".join(
            token.lemma_ + "_" + token.pos_
            for token in doc
            if not token.is_stop
        )
        for doc in nlp.pipe(text)
    ]

In [13]:
# Run the lemma_POS conversion over the training split, then the test split
processed_train_data, processed_test_data = (
    morphology(X_train),
    morphology(X_test),
)

## TF-IDF Vectorizer

In [14]:
# TF-IDF over the lemma_POS strings: fit on the processed training reviews,
# then map the processed test reviews onto the same vocabulary.
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(processed_train_data)
X_test_tfidf = vectorizer.transform(processed_test_data)

## SVM Model
### Training and Testing

In [15]:
# Linear-kernel SVM trained on the lemma_POS TF-IDF features
model = SVC(kernel='linear').fit(X_train_tfidf, y_train)

# Predicted labels for the test set, scored in the evaluation cell
y_pred = model.predict(X_test_tfidf)

### Evaluation

In [16]:
# Metrics for the lemma_POS model; precision, recall and F1 are computed
# with average='weighted'.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.8388
Precision: 0.8388124303690976
Recall: 0.8388
F1 Score: 0.8387996389117689
