# Importing libraries, utilities

In [None]:
# pip install -U imbalanced-learn

In [1]:
import pickle

In [2]:
from time import time

In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt

import re

import ast

import seaborn as sns

from scipy.sparse import csr_matrix

from scipy.stats import uniform, loguniform, randint, norm, chi2_contingency

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.cluster import KMeans, DBSCAN

from sklearn.decomposition import PCA

from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier

from sklearn.impute import SimpleImputer

from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_validate, cross_val_score

from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, MaxAbsScaler, PolynomialFeatures, FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion

from sklearn.base import BaseEstimator, TransformerMixin

import ipywidgets as widgets
from ipywidgets import interact

In [4]:
import imblearn
from imblearn.over_sampling import RandomOverSampler

In [5]:
from sklearn.utils import resample

In [6]:
import warnings
warnings.filterwarnings("ignore")

In [7]:
from text_preprocessor import TextPreprocessor  # Import the TextPreprocessor class

# Loading the data

We load the raw dataset, entitled "big.csv", into a Pandas dataframe 'names'. 

# EDA 

We observe that our dataset comprises some 200 million records, consisting of fields 'firstname', 'lastname', and 'gender'. (We shall presently opt to merge firstname and lastname into a single 'name'.)

The top entry in each of the three fields happens to be '\N'. While it makes up a fair proportion of each field, this is most marked in the case of 'gender', where it accounts for an overwhelming 95%. 

Basic inspection of the dataframe shows that all entries are string-type. An important takeaway is that inconsistent letter case is distorting the analysis, because identical values are being counted as distinct. (For instance, lastname currently distinguishes between 'Kumar' and 'kumar'; likewise, gender between 'M' and 'm', 'F' and 'f'.) 


# Data preprocessing

We uppercase all entries for uniformity. 
We then drop records where both firstname and lastname are '\N'; then drop records where gender is anything other than male or female.

We concatenate firstname and lastname into a single column 'name' (separated by a space), then strip it of leading and trailing whitespace. 
We encode gender in binary as 'target'. 
We retain only 'name' and 'target' in our dataframe.

We then observe that 'name' contains some null values, so we drop those records.

Observe that some of the most popular names are aliases (specifically, '8888 DATING', 'GAANA USER', 'GUEST \\N', and 'GUEST LOGIN'), so we drop those records. 

We drop "anti-English" records, *i.e.,* records where name contains fewer than two English letters. 

Next, we drop "junk" English records, where name either contains just one distinct letter, or doesn't contain any vowels.



*The dataframe at this stage is entitled 'processed_data'.*

Lastly, we strip the names of all non-alphabetical, non-whitespace characters; then once more strip them of leading and trailing whitespace.

**Now, the dataframe is 'new_processed_data'.**

## Data modification (for consistent labelling):

### Dataframe:

In [None]:
# print(df)

### Distinct names, alongside corresponding counts:

In [None]:
# name_counts = df['name'].value_counts()
# print(name_counts)

**4.4 million distinct names.**

### Names with count > 1, alongside their counts:

In [None]:
# filtered_names = name_counts[name_counts > 1]
# print(filtered_names)

### Number of names with count 1:

In [None]:
# df.shape[0]-filtered_names.values.sum()

### Names with count > 1, alongside the split of each count by true label:

In [None]:
# # Filter the original dataframe to include only names with count > 1
# filtered_df = df[df['name'].isin(filtered_names.index)]

# # Group by 'name' and 'target' and count occurrences
# label_counts = filtered_df.groupby(['name', 'target']).size().unstack(fill_value=0)

# # Reindex the label_counts DataFrame based on the index of filtered_names
# label_counts = label_counts.reindex(index=filtered_names.index)

# # Print the count of each label for each name
# print(label_counts)

In [None]:
# # Determine the dominant label for each name
# label_counts['dominant_label'] = label_counts.idxmax(axis=1)

# # Print the dominant label for each name
# print(label_counts['dominant_label'])

In [None]:
# duplicated_labels = label_counts['dominant_label'].reindex(filtered_names.index).repeat(filtered_names.values)
# print(duplicated_labels)

In [None]:
# duplicated_labels_df = duplicated_labels.reset_index()
# duplicated_labels_df.columns = ['name', 'target']

In [None]:
# new_series = name_counts[name_counts == 1]

In [None]:
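# new_series_df = new_series.reset_index()
# # NOTE(review): `new_series` holds value counts (all equal to 1), so the rename below
# # stores the count as 'target' rather than each record's original label - confirm this is intended.
# new_series_df.columns = ['name', 'target']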

In [None]:
# df = pd.concat([duplicated_labels_df, new_series_df], ignore_index=True)

**The dataframe at this stage is accessible as 'latest_processed_data'.**

In [8]:
# Load the relabelled dataset ('latest_processed_data') from disk
data_path = '/root/Abhinav/Gender Prediction Project/latest_processed_data.csv'
df = pd.read_csv(data_path)

# Train setup

In [9]:
# Features are the name strings; labels are the 'target' column
X, y = df['name'], df['target']

### Oversampling to balance the (training) dataset:

In [10]:
# Balance the classes by randomly oversampling the minority class.
# The seed is fixed so the resampled dataset is reproducible.
oversampler = RandomOverSampler(random_state=42)

# fit_resample is handed a one-column DataFrame rather than the Series itself,
# which is why the names are read back later as X_resampled['name'].
name_frame = X.to_frame()
X_resampled, y_resampled = oversampler.fit_resample(name_frame, y)

# Machine learning

### Model (pipeline) setup:

In [56]:
# Model pipeline: project text preprocessing -> TF-IDF features -> logistic regression
pipeline_steps = [
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LogisticRegression(random_state=42)),
]
pipeline = Pipeline(steps=pipeline_steps)

### Training:

In [57]:
# Fit the pipeline on the oversampled data and time the fit.
# perf_counter is a monotonic clock, so the measured duration cannot be skewed
# by system clock adjustments the way a difference of time() readings can.
from time import perf_counter

start_time = perf_counter()
pipeline.fit(X_resampled['name'], y_resampled)
end_time = perf_counter()

In [58]:
# Report how long the fit took, to four decimal places
elapsed_seconds = end_time - start_time
print(f"Pipeline time: {elapsed_seconds:.4f} seconds")

Pipeline time: 251.0804 seconds


~4 mins to fit.

### Junk support:

In [84]:
def extract_english_part(input_string):
    """Return the ASCII letters of ``input_string``, concatenated in order.

    Every other character (digits, whitespace, punctuation, non-ASCII
    characters) is dropped, so the result may be the empty string.
    """
    letter_runs = re.findall(r'[a-zA-Z]+', input_string)
    return ''.join(letter_runs)

def isJunk(input_string):
    """Report whether a name is too degenerate to be worth classifying.

    Only the ASCII letters of ``input_string`` are considered. The name is
    junk when those letters contain fewer than three distinct characters
    (the comparison is case-sensitive, so 'B' and 'b' count separately), or
    when none of them is a vowel.
    """
    letters = extract_english_part(input_string)

    # Too few distinct letters, e.g. 'aaaaa bbbb' or a string with no letters at all
    if len(set(letters)) < 3:
        return True

    has_vowel = any(char in 'AEIOUaeiou' for char in letters)
    return not has_vowel


In [85]:
class GenderPredictor(BaseEstimator, TransformerMixin):
    """Gender predictor that short-circuits junk names before the model.

    Names flagged by the junk checker are assigned ``JUNK_LABEL`` without
    being sent to the model; every other name is labelled by the wrapped
    pipeline.

    Parameters
    ----------
    pipeline : object
        Object exposing ``predict(list_of_names)``. ``fit`` on this wrapper
        does nothing, so the pipeline must already be fitted.
    isJunk : callable
        Called with a single name; a truthy result marks the name as junk.
    """

    # Label returned for names rejected by the junk checker
    JUNK_LABEL = 2

    def __init__(self, pipeline, isJunk):
        self.pipeline = pipeline
        self.isJunk = isJunk

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the wrapped pipeline is not refitted."""
        return self

    def predict(self, X):
        """Return one label per name in ``X``, in input order.

        Parameters
        ----------
        X : iterable of str
            Names to label.

        Returns
        -------
        list
            ``JUNK_LABEL`` for junk names, otherwise the pipeline's prediction.
        """
        # Materialise once so that one-shot iterables can be walked twice
        names = list(X)
        junk_flags = [self.isJunk(name) for name in names]
        valid_names = [
            name for name, is_junk in zip(names, junk_flags) if not is_junk
        ]

        # One batched call instead of one pipeline invocation per name; the
        # call is skipped entirely when every name was junk so the pipeline
        # is never handed an empty batch.
        if valid_names:
            model_labels = iter(self.pipeline.predict(valid_names))
        else:
            model_labels = iter(())

        # Re-interleave the model output with the junk label, preserving order
        return [
            self.JUNK_LABEL if is_junk else next(model_labels)
            for is_junk in junk_flags
        ]

In [87]:
# Wrap the trained `pipeline` and the `isJunk` checker in a single predictor
gender_predictor = GenderPredictor(pipeline, isJunk)

# Names to predict gender for (the last two are junk inputs).
# Kept under their own name so the training features `X` are not overwritten,
# which would break re-running the oversampling cell.
sample_names = ['Alice', 'Bob', "sneha jaiswal", "abhinav ", 'aaaaa bbbb', '?????']

# Predict genders for the names
predictions = gender_predictor.predict(sample_names)

print(predictions)

[0, 1, 0, 1, 2, 2]


In [69]:
# # Export the pipeline to a .pkl file
# with open('final_pipeline.pkl', 'wb') as file:
#     pickle.dump(pipeline, file)