In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


file_path = '/content/draft_50000 (1).csv'
data = pd.read_csv(file_path)


print(data.head())
print(data.info())

data.dropna(inplace=True)
data['text'] = data['text'].str.lower()


label_encoder = LabelEncoder()
data['class'] = label_encoder.fit_transform(data['class'])

X_train, X_test, y_train, y_test = train_test_split(data['text'], data['class'], test_size=0.2, random_state=42)


   Unnamed: 0                                               text  class
0       51297  get crush guy definitely way old laugh literal...      0
1       24705   go to july 2018i hope 8 month enough change mind      1
2      185969  want live anymore 23 right thinking end life p...      1
3      201675  every time get period want kill already depres...      1
4       52701  story incomplete similar story op case dad spe...      1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  50000 non-null  int64 
 1   text        49989 non-null  object
 2   class       50000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.1+ MB
None


Feature Extraction using Count Vectorizer and Model Training using Logistic Regression

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer()

X_train_cv = count_vectorizer.fit_transform(X_train)
X_test_cv = count_vectorizer.transform(X_test)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score


lr_model = LogisticRegression()


lr_model.fit(X_train_cv, y_train)

y_pred = lr_model.predict(X_test_cv)

print(classification_report(y_test, y_pred))
print('Accuracy:', accuracy_score(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.91      0.94      0.92      4935
           1       0.94      0.90      0.92      5063

    accuracy                           0.92      9998
   macro avg       0.92      0.92      0.92      9998
weighted avg       0.92      0.92      0.92      9998

Accuracy: 0.9212842568513703


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


 Feature Extraction using Tfidf and Model Training using Logistic Regression

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()


X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

lr_model = LogisticRegression()

lr_model.fit(X_train_tfidf, y_train)

y_pred = lr_model.predict(X_test_tfidf)

print('Logistic Regression with TF-IDF')
print(classification_report(y_test, y_pred))
print('Accuracy:', accuracy_score(y_test, y_pred))



Logistic Regression with TF-IDF
              precision    recall  f1-score   support

           0       0.92      0.94      0.93      4935
           1       0.94      0.92      0.93      5063

    accuracy                           0.93      9998
   macro avg       0.93      0.93      0.93      9998
weighted avg       0.93      0.93      0.93      9998

Accuracy: 0.9319863972794559


Feature Extraction using word2vec and Model Training using Logistic Regression

In [30]:
import gensim
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import numpy as np

X_train_tokens = X_train.apply(gensim.utils.simple_preprocess)
X_test_tokens = X_test.apply(gensim.utils.simple_preprocess)

w2v_model = Word2Vec(sentences=X_train_tokens, vector_size=100, window=5, min_count=1, workers=4)


def document_vector(tokens, model):

    tokens = [token for token in tokens if token in model.wv.key_to_index]
    if len(tokens) == 0:
        return np.zeros(model.vector_size)
    return np.mean(model.wv[tokens], axis=0)

X_train_w2v = np.array([document_vector(tokens, w2v_model) for tokens in X_train_tokens])
X_test_w2v = np.array([document_vector(tokens, w2v_model) for tokens in X_test_tokens])


In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

lr_model = LogisticRegression()

lr_model.fit(X_train_w2v, y_train)

y_pred = lr_model.predict(X_test_w2v)

print('Logistic Regression with Word2Vec')
print(classification_report(y_test, y_pred))
print('Accuracy:', accuracy_score(y_test, y_pred))


Logistic Regression with Word2Vec
              precision    recall  f1-score   support

           0       0.91      0.89      0.90      4935
           1       0.89      0.91      0.90      5063

    accuracy                           0.90      9998
   macro avg       0.90      0.90      0.90      9998
weighted avg       0.90      0.90      0.90      9998

Accuracy: 0.9002800560112022


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Feature Extraction using Glove and Model Training using Logistic Regression

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
from gensim.utils import simple_preprocess
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import zipfile
import os

# Load data
file_path = '/content/draft_50000 (1).csv'  # Update this path
data = pd.read_csv(file_path)

# Basic preprocessing
data.dropna(inplace=True)
data['text'] = data['text'].str.lower()

label_encoder = LabelEncoder()
data['class'] = label_encoder.fit_transform(data['class'])


X_train, X_test, y_train, y_test = train_test_split(data['text'], data['class'], test_size=0.2, random_state=42)

# Unzip GloVe file
glove_zip_path = '/content/glove.6B.zip'  # Update this path to the GloVe zip file
glove_txt_path = '/content/glove.6B.100d.txt'  # Change this if you're using a different dimension

if not os.path.exists(glove_txt_path):
    with zipfile.ZipFile(glove_zip_path, 'r') as zip_ref:
        zip_ref.extract(glove_txt_path)

# Load GloVe vectors
def load_glove_embeddings(glove_file_path):
    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

glove_embeddings = load_glove_embeddings(glove_txt_path)

# Tokenize the text data
X_train_tokens = X_train.apply(simple_preprocess)
X_test_tokens = X_test.apply(simple_preprocess)

# Function to compute the average GloVe vector for a document
def document_vector(tokens, embeddings_index, vector_size):
    # Remove out-of-vocabulary words
    tokens = [token for token in tokens if token in embeddings_index]
    if len(tokens) == 0:
        return np.zeros(vector_size)
    return np.mean([embeddings_index[token] for token in tokens], axis=0)

# Compute document vectors for the training and testing sets
vector_size = 100  # GloVe vector size
X_train_glove = np.array([document_vector(tokens, glove_embeddings, vector_size) for tokens in X_train_tokens])
X_test_glove = np.array([document_vector(tokens, glove_embeddings, vector_size) for tokens in X_test_tokens])

# Initialize Logistic Regression model
lr_model = LogisticRegression(max_iter=1000)

# Train the model
lr_model.fit(X_train_glove, y_train)

# Make predictions
y_pred = lr_model.predict(X_test_glove)

# Evaluate the model
print('Logistic Regression with GloVe')
print(classification_report(y_test, y_pred))
print('Accuracy:', accuracy_score(y_test, y_pred))


Logistic Regression with GloVe
              precision    recall  f1-score   support

           0       0.88      0.82      0.85      4935
           1       0.84      0.89      0.86      5063

    accuracy                           0.86      9998
   macro avg       0.86      0.86      0.86      9998
weighted avg       0.86      0.86      0.86      9998

Accuracy: 0.8554710942188438


Feature Extraction using Glove and Model Training using Decision Tree

In [None]:
from gensim.utils import simple_preprocess
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
import zipfile
import os



glove_zip_path = '/content/glove.6B.zip'
glove_txt_path = '/content/glove.6B.100d.txt'

if not os.path.exists(glove_txt_path):
    with zipfile.ZipFile(glove_zip_path, 'r') as zip_ref:
        zip_ref.extract(glove_txt_path)

# Load GloVe vectors
def load_glove_embeddings(glove_file_path):
    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

glove_embeddings = load_glove_embeddings(glove_txt_path)

# Tokenize the text data
X_train_tokens = X_train.apply(simple_preprocess)
X_test_tokens = X_test.apply(simple_preprocess)

# Function to compute the average GloVe vector for a document
def document_vector(tokens, embeddings_index, vector_size):
    # Remove out-of-vocabulary words
    tokens = [token for token in tokens if token in embeddings_index]
    if len(tokens) == 0:
        return np.zeros(vector_size)
    return np.mean([embeddings_index[token] for token in tokens], axis=0)

# Compute document vectors for the training and testing sets
vector_size = 100  # GloVe vector size
X_train_glove = np.array([document_vector(tokens, glove_embeddings, vector_size) for tokens in X_train_tokens])
X_test_glove = np.array([document_vector(tokens, glove_embeddings, vector_size) for tokens in X_test_tokens])

# Initialize Decision Tree model
dt_model = DecisionTreeClassifier(random_state=42)

# Train the model
dt_model.fit(X_train_glove, y_train)

# Make predictions
y_pred = dt_model.predict(X_test_glove)

# Evaluate the model
print('Decision Tree with GloVe')
print(classification_report(y_test, y_pred))
print('Accuracy:', accuracy_score(y_test, y_pred))


Decision Tree with GloVe
              precision    recall  f1-score   support

           0       0.79      0.79      0.79      4935
           1       0.80      0.79      0.79      5063

    accuracy                           0.79      9998
   macro avg       0.79      0.79      0.79      9998
weighted avg       0.79      0.79      0.79      9998

Accuracy: 0.7920584116823365


Feature Extraction using tfidf and Model Training using Decision Tree

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

# Feature extraction using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

dt_model = DecisionTreeClassifier(random_state=42)

# Train the model
dt_model.fit(X_train_tfidf, y_train)

# Make predictions
y_pred = dt_model.predict(X_test_tfidf)

# Evaluate the model
print('Decision Tree with TF-IDF')
print(classification_report(y_test, y_pred))
print('Accuracy:', accuracy_score(y_test, y_pred))


Decision Tree with TF-IDF
              precision    recall  f1-score   support

           0       0.83      0.84      0.84      4935
           1       0.84      0.83      0.84      5063

    accuracy                           0.84      9998
   macro avg       0.84      0.84      0.84      9998
weighted avg       0.84      0.84      0.84      9998

Accuracy: 0.8384676935387078


Feature Extraction using CountVectorizer and Model Training using Decision Tree



In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score


# Feature extraction using CountVectorizer
count_vectorizer = CountVectorizer(max_features=5000)  # Adjust max_features as needed
X_train_counts = count_vectorizer.fit_transform(X_train)
X_test_counts = count_vectorizer.transform(X_test)

dt_model = DecisionTreeClassifier(random_state=42)

# Train the model
dt_model.fit(X_train_counts, y_train)

y_pred = dt_model.predict(X_test_counts)

# Evaluate the model
print('Decision Tree with CountVectorizer')
print(classification_report(y_test, y_pred))
print('Accuracy:', accuracy_score(y_test, y_pred))


Decision Tree with CountVectorizer
              precision    recall  f1-score   support

           0       0.83      0.81      0.82      4935
           1       0.82      0.84      0.83      5063

    accuracy                           0.83      9998
   macro avg       0.83      0.83      0.83      9998
weighted avg       0.83      0.83      0.83      9998

Accuracy: 0.8271654330866173


Feature Extraction using Word2Vec and Model Training using Decision Tree


In [None]:
from gensim.models import Word2Vec
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

X_train_tokens = X_train.apply(word_tokenize)

w2v_model = Word2Vec(sentences=X_train_tokens, vector_size=100, window=5, min_count=1, workers=4)
def document_vector(tokens, model, vector_size):
    token_vectors = [model.wv[token] for token in tokens if token in model.wv.key_to_index]
    if not token_vectors:
        return np.zeros(vector_size)
    return np.mean(token_vectors, axis=0)


X_train_w2v = np.array([document_vector(tokens, w2v_model, 100) for tokens in X_train_tokens])

# Tokenize and compute document vectors for test set
X_test_tokens = X_test.apply(word_tokenize)
X_test_w2v = np.array([document_vector(tokens, w2v_model, 100) for tokens in X_test_tokens])
# Initialize Decision Tree model
dt_model = DecisionTreeClassifier(random_state=42)

# Train the model
dt_model.fit(X_train_w2v, y_train)
# Make predictions
y_pred = dt_model.predict(X_test_w2v)

# Evaluate the model
print('Decision Tree with Word2Vec')
print(classification_report(y_test, y_pred))
print('Accuracy:', accuracy_score(y_test, y_pred))



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Decision Tree with Word2Vec
              precision    recall  f1-score   support

           0       0.84      0.84      0.84      4935
           1       0.84      0.84      0.84      5063

    accuracy                           0.84      9998
   macro avg       0.84      0.84      0.84      9998
weighted avg       0.84      0.84      0.84      9998

Accuracy: 0.8424684936987398
