<a href="https://colab.research.google.com/github/Rahulappu2004/Fake-Account-Detection-in-Twitter-X/blob/main/Word2Vec_XGBOOST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Word2Vec + XGBoost**

In [None]:
!pip install gensim xgboost scikit-learn pandas




In [None]:
pip install --upgrade xgboost




**Import necessary libraries**

In [None]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import random

**Load the dataset**

In [None]:
# The input is a tab-separated text file without a header row; the two
# fields of each line are exposed as the 'Tweet' and 'Label' columns.
file_path = '/content/twitter_label.txt'
data = pd.read_csv(
    file_path,
    sep='\t',
    names=['Tweet', 'Label'],
    header=None,
)

**Preprocess the data**

In [None]:
def preprocess_text(text):
    """Lower-case a tweet and split it into whitespace-separated tokens.

    Parameters
    ----------
    text : str or any scalar
        Raw tweet text. Missing values (``None``, ``NaN``, ``pd.NA``) are
        accepted because ``read_csv`` yields ``NaN`` for empty fields.

    Returns
    -------
    list of str
        The lower-cased tokens; an empty list for missing or empty input.
    """
    if isinstance(text, str):
        return text.lower().split()
    # A missing cell has no tokens; calling .lower() on it would raise.
    if pd.isna(text):
        return []
    # Any other scalar (e.g. a purely numeric tweet parsed as a number) is
    # tokenised from its string form instead of crashing.
    return str(text).lower().split()

# Tokenise every tweet; each cell of 'Processed_Tweet' holds a list of tokens.
data['Processed_Tweet'] = data['Tweet'].map(preprocess_text)

**Introduce Random Typos into Tweets**


In [None]:
def add_typos(tweet, typo_prob=0.1, rng=None):
    """Return a copy of a tokenised tweet with random single-character typos.

    Each word is selected with probability ``typo_prob``; a selected word
    longer than one character has one randomly chosen position overwritten
    with a random lower-case ASCII letter (which may equal the original
    character). Words of length 0 or 1 are never altered.

    Parameters
    ----------
    tweet : iterable of str
        Tokens of one tweet. The input is not modified.
    typo_prob : float, optional
        Per-word probability of injecting a typo. Defaults to 0.1.
    rng : random.Random, optional
        Source of randomness. Defaults to the module-level ``random``
        generator, so ``random.seed(...)`` controls the result.

    Returns
    -------
    list of str
        The (possibly) noisy tokens, in the original order.
    """
    rng = random if rng is None else rng
    noisy_tweet = []
    for word in tweet:
        # Draw the random number first, even for short words, so the random
        # stream is consumed exactly once per word regardless of its length.
        if rng.random() < typo_prob and len(word) > 1:
            chars = list(word)
            position = rng.randint(0, len(chars) - 1)
            chars[position] = rng.choice('abcdefghijklmnopqrstuvwxyz')
            noisy_tweet.append(''.join(chars))
        else:
            noisy_tweet.append(word)
    return noisy_tweet

In [None]:
def flip_label(label, flip_prob=0.05, rng=None):
    """Randomly flip a class label to simulate label noise.

    With probability ``flip_prob`` the label is swapped: 'Bot' becomes
    'Human', and any other value becomes 'Bot'. Otherwise the label is
    returned unchanged.

    Parameters
    ----------
    label : str
        Original label.
    flip_prob : float, optional
        Probability of flipping the label. Defaults to 0.05 (5%).
    rng : random.Random, optional
        Source of randomness. Defaults to the module-level ``random``
        generator, so ``random.seed(...)`` controls the result.

    Returns
    -------
    str
        The original or the flipped label.
    """
    rng = random if rng is None else rng
    if rng.random() < flip_prob:
        return 'Human' if label == 'Bot' else 'Bot'
    return label

# Seed Python's RNG so the injected typos and label flips are identical on
# every fresh "Restart & Run All"; the other stochastic steps in this
# notebook already use a fixed seed of 42.
# NOTE: the noise is applied to every row before the train/test split, so the
# held-out labels are noisy as well. Re-running this cell without reloading
# the data stacks another round of noise on top of the previous one.
random.seed(42)
data['Processed_Tweet'] = data['Processed_Tweet'].apply(add_typos)
data['Label'] = data['Label'].apply(flip_label)

**Building Word2Vec model**

In [None]:
# Train 100-dimensional word embeddings on the (noisy) tokenised tweets.
# workers=1: gensim documents that `seed` alone does not give a reproducible
# model when several worker threads are used, because thread scheduling
# changes the training order. (gensim also notes that reproducibility across
# interpreter launches additionally requires a fixed PYTHONHASHSEED.)
# min_count=1 keeps every token, including one-off typos, in the vocabulary.
w2v_model = Word2Vec(
    sentences=data['Processed_Tweet'],
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=42,
)

**Generating Tweet Embeddings**

In [None]:
def tweet_to_vec(tweet):
    """Embed a tokenised tweet as the mean of its word vectors.

    Tokens missing from the vocabulary of the global ``w2v_model`` are
    skipped. When no token has a vector, a zero vector of length
    ``w2v_model.vector_size`` is returned instead.
    """
    keyed_vectors = w2v_model.wv
    known = [keyed_vectors[token] for token in tweet if token in keyed_vectors]
    if not known:
        return np.zeros(w2v_model.vector_size)
    return np.mean(known, axis=0)

# One embedding vector per tweet, stored as an array in the 'Vector' column.
data['Vector'] = data['Processed_Tweet'].map(tweet_to_vec)

**Building the feature matrix `X` and the label vector `y`**

In [None]:
# Integer-encode the string labels for XGBoost.
y = LabelEncoder().fit_transform(data['Label'])
# Stack the per-tweet vectors row-wise into a single 2-D feature matrix.
X = np.vstack(data['Vector'].tolist())

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

**Building and fitting the XGBoost model**

In [None]:
import xgboost as xgb

# Carve a validation set out of the training data for early stopping.
# Using the test set to decide when to stop would leak it into model
# selection and make the reported test metrics optimistic.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Convert data to DMatrix
dtrain = xgb.DMatrix(X_fit, label=y_fit)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)  # used only for final scoring

# Define parameters
params = {
    'objective': 'binary:logistic',  # Binary classification
    'eval_metric': 'logloss',       # Evaluation metric
    'eta': 0.1,                     # Learning rate
    'max_depth': 4,                 # Tree depth
    'subsample': 0.8,               # Row sampling
    'colsample_bytree': 0.8,        # Feature sampling
    'lambda': 1.0,                  # L2 regularization
    'random_state': 42              # For reproducibility
}

# Specify evaluation sets. xgb.train uses the LAST entry for early stopping,
# so the validation split must come last and the test set must not appear.
evals = [(dtrain, 'train'), (dval, 'validation')]

# Train with early stopping
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=100,           # Maximum number of boosting rounds
    evals=evals,                   # Evaluation sets
    early_stopping_rounds=10,      # Stop if no improvement on validation
    verbose_eval=False             # Suppress detailed output
)

In [None]:
# Predict with the trees up to and including the best early-stopping round,
# stated explicitly so the result does not depend on which iteration the
# installed xgboost version uses by default. Probabilities above 0.5 are
# mapped to class 1, everything else to class 0.
y_pred = (
    bst.predict(dtest, iteration_range=(0, bst.best_iteration + 1)) > 0.5
).astype(int)

**Evaluating model performance**

In [None]:
# Score the test-set predictions. Precision, recall and F1 are averaged over
# the classes, weighted by each class's support.
weighted = dict(average='weighted')
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    metric(y_test, y_pred, **weighted)
    for metric in (precision_score, recall_score, f1_score)
)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Accuracy: 0.91
Precision: 0.91
Recall: 0.91
F1 Score: 0.91
