In [1]:
import warnings
# Silence every Python warning for the rest of the session. This keeps the
# cell outputs clean, but it also hides deprecation and other library warnings.
warnings.filterwarnings('ignore')

In [2]:
import nltk

# nltk.word_tokenize (used by the POS-tagging step of this notebook) relies on
# the Punkt tokenizer data in addition to the tagger model; without it a fresh
# environment fails with LookupError. The tagger download stays last so the
# cell still displays its return value.
# NOTE(review): newer NLTK releases may look for differently named resources
# (e.g. 'punkt_tab'); if a LookupError names another package, download that one.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\abdul\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
# Load the required libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [4]:
# Load the dataset (path is relative to the notebook's working directory)
data = pd.read_csv('dataset/dataset.csv')
# Show the column names so the schema used by the following cells is visible
data.columns

Index(['Text', 'Language', 'Label'], dtype='object')

In [5]:
# Keep only the rows tagged as English, then cap the sample at the first
# 10,000 of those rows.
english_rows = data['Language'] == 'en'
data = data.loc[english_rows].iloc[:10000]

# Features are the raw text; the target is the label column.
X, y = data['Text'], data['Label']

In [6]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the target column and keep the
# fitted encoder around so the mapping can be reused later.
encoder = LabelEncoder().fit(y)

# Encode the target column with the mapping learned above.
y_encoded = encoder.transform(y)

In [7]:
# Hold out 20% of the samples for testing; random_state pins the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [8]:
# Extract TF-IDF features from the raw text (vocabulary capped at 500 terms).
# The vectorizer is fitted on the training split only and then applied
# unchanged to the test split, so the test data does not influence the
# vocabulary or the IDF weights.
vectorizer = TfidfVectorizer(max_features=500)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [9]:
# Train the random forest classifier (100 trees, fixed seed for repeatable
# results) on the text-only TF-IDF features.
model = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train_tfidf, y_train)

# Predict labels for the held-out test features
y_pred = model.predict(X_test_tfidf)

In [10]:
# Score the predictions. Precision, recall and F1 all use the same
# 'weighted' averaging mode, so they are computed through one shared call shape.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

# Report every metric on its own line.
for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(label, value)

Accuracy: 0.964
Precision: 0.9643213154902537
Recall: 0.964
F1 Score: 0.964019045065961


In [11]:
# Perform POS tagging on the text data
def pos_tagging(text):
    """Return the part-of-speech tags NLTK assigns to the tokens of ``text``.

    The text is tokenized with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``; only the tags are kept, in token order.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    tags = []
    for _word, tag in tagged_tokens:
        tags.append(tag)
    return tags

# Tag every document of both splits; each entry is that document's tag list.
X_train_pos = list(map(pos_tagging, X_train))
X_test_pos = list(map(pos_tagging, X_test))

In [12]:
# Combine the text data and POS tags: every document becomes its original text
# followed by a space and its space-separated POS tags.
# zip() pairs each text with its tag list by position, so the series never has
# to be converted to a list (doing that per iteration made the step quadratic).
X_train_combined = [
    text + " " + " ".join(tags)
    for text, tags in zip(X_train, X_train_pos)
]

X_test_combined = [
    text + " " + " ".join(tags)
    for text, tags in zip(X_test, X_test_pos)
]

In [13]:
# Re-extract TF-IDF features, this time from the text+POS documents and with a
# larger vocabulary cap (1000 terms). This rebinds vectorizer, X_train_tfidf
# and X_test_tfidf, replacing the text-only features built earlier.
# NOTE(review): TfidfVectorizer lowercases its input by default, so a POS tag
# such as "IN" ends up as the same term as the word "in" - confirm this is
# the intended behaviour.
vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = vectorizer.fit_transform(X_train_combined)
X_test_tfidf = vectorizer.transform(X_test_combined)

In [14]:
# Train the random forest classifier (100 trees, fixed seed for repeatable
# results) on the text+POS TF-IDF features.
model = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train_tfidf, y_train)

# Predict labels for the held-out test features
y_pred = model.predict(X_test_tfidf)

In [15]:
# Score the text+POS model. Precision, recall and F1 all use the same
# 'weighted' averaging mode, so they are computed through one shared call shape.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

# Report every metric on its own line.
for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(label, value)

Accuracy: 0.955
Precision: 0.9554770614491291
Recall: 0.955
F1 Score: 0.9550683164510744


In [16]:
import spacy

# Load the English language model. The 'en_core_web_sm' package has to be
# installed in the environment beforehand, otherwise spacy.load fails.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Process the dataset using the spaCy language model
def morphology_processing(text):
    """Convert each document into a string of ``lemma_POS`` tokens.

    Parameters
    ----------
    text : iterable of str
        Documents to process (for example a pandas Series of raw text).

    Returns
    -------
    list of str
        One string per input document, in input order. Each string contains
        the document's non-stop-word tokens rendered as ``<lemma>_<POS>`` and
        joined by single spaces.
    """
    processed_data = []
    # nlp.pipe streams the documents through the pipeline in batches, which
    # spaCy recommends over calling nlp() once per document for efficiency.
    for doc in nlp.pipe(text):
        # Single pass over the tokens: lemma and coarse POS tag are read from
        # the same token, so no intermediate lists need to be zipped together.
        processed_data.append(
            " ".join(
                token.lemma_ + "_" + token.pos_
                for token in doc
                if not token.is_stop
            )
        )
    return processed_data

# Build the lemma_POS documents for both splits. This runs the spaCy pipeline
# over every document, so expect it to be the slowest step of the notebook.
processed_train_data = morphology_processing(X_train)
processed_test_data = morphology_processing(X_test)

In [None]:
# Vectorize the lemma_POS documents using TF-IDF (vocabulary capped at 1000
# terms), fitting on the training split only. This rebinds vectorizer,
# X_train_tfidf and X_test_tfidf, replacing the features built earlier.
# NOTE(review): with the default settings the vectorizer lowercases its input,
# so a token such as "run_VERB" is presumably stored as "run_verb" - confirm.
vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = vectorizer.fit_transform(processed_train_data)
X_test_tfidf = vectorizer.transform(processed_test_data)

In [None]:
# Train the random forest classifier (100 trees, fixed seed for repeatable
# results) on the lemma_POS TF-IDF features.
model = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train_tfidf, y_train)

# Predict labels for the held-out test features
y_pred = model.predict(X_test_tfidf)

In [None]:
# Score the lemma_POS model. Precision, recall and F1 all use the same
# 'weighted' averaging mode, so they are computed through one shared call shape.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

# Report every metric on its own line.
for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(label, value)