In [1]:
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import nltk     # It is the one of most comman libraries for Natural Language Process
import re       # Regular Expression library


In [2]:
# Read the raw CSV export into a DataFrame; the file is decoded as latin-1.
data = pd.read_csv(
    'gender-classifier-DFE-791531.csv',
    encoding='latin-1',
)

# Print the (rows, columns) shape, then show the first rows as the cell output.
print(data.shape)
data.head()

(20050, 26)


Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,gender,gender:confidence,profile_yn,profile_yn:confidence,created,...,profileimage,retweet_count,sidebar_color,text,tweet_coord,tweet_count,tweet_created,tweet_id,tweet_location,user_timezone
0,815719226,False,finalized,3,10/26/15 23:24,male,1.0,yes,1.0,12/5/13 1:48,...,https://pbs.twimg.com/profile_images/414342229...,0,FFFFFF,Robbie E Responds To Critics After Win Against...,,110964,10/26/15 12:40,6.5873e+17,main; @Kan1shk3,Chennai
1,815719227,False,finalized,3,10/26/15 23:30,male,1.0,yes,1.0,10/1/12 13:51,...,https://pbs.twimg.com/profile_images/539604221...,0,C0DEED,ÛÏIt felt like they were my friends and I was...,,7471,10/26/15 12:40,6.5873e+17,,Eastern Time (US & Canada)
2,815719228,False,finalized,3,10/26/15 23:33,male,0.6625,yes,1.0,11/28/14 11:30,...,https://pbs.twimg.com/profile_images/657330418...,1,C0DEED,i absolutely adore when louis starts the songs...,,5617,10/26/15 12:40,6.5873e+17,clcncl,Belgrade
3,815719229,False,finalized,3,10/26/15 23:10,male,1.0,yes,1.0,6/11/09 22:39,...,https://pbs.twimg.com/profile_images/259703936...,0,C0DEED,Hi @JordanSpieth - Looking at the url - do you...,,1693,10/26/15 12:40,6.5873e+17,"Palo Alto, CA",Pacific Time (US & Canada)
4,815719230,False,finalized,3,10/27/15 1:15,female,1.0,yes,1.0,4/16/14 13:23,...,https://pbs.twimg.com/profile_images/564094871...,0,0,Watching Neighbours on Sky+ catching up with t...,,31462,10/26/15 12:40,6.5873e+17,,


In [3]:
# Keep only the label column (gender) and the tweet text; everything else is discarded.
data = pd.concat([data["gender"], data["text"]], axis="columns")

# Remove, in place, every row where either of the two columns is missing.
data.dropna(axis="index", inplace=True)

# Summary of the remaining rows, columns and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19953 entries, 0 to 20049
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   gender  19953 non-null  object
 1   text    19953 non-null  object
dtypes: object(2)
memory usage: 467.6+ KB


In [4]:
# Binary target: 1 where the label is exactly "female", 0 for every other label value.
# NOTE: this cell is not idempotent -- once the column holds integers, running it
# again compares integers against "female" and sets every row to 0.
data.gender = (data.gender == "female").astype("int64")

# Class balance of the encoded target.
print(data.gender.value_counts())

0    13253
1     6700
Name: gender, dtype: int64


In [5]:
# Download the NLTK resources this notebook relies on. Each call is a no-op
# (apart from a log line) when the package is already up to date.
nltk.download("stopwords")
nltk.download('punkt')
# Recent NLTK releases (3.9+) make nltk.word_tokenize load the "punkt_tab"
# resource instead of the pickled "punkt" models; without it the tokenizer
# raises LookupError. Fetching both keeps the notebook working on old and new NLTK.
nltk.download('punkt_tab')
# Needed by WordNetLemmatizer. Kept as the last expression so the cell still
# displays the download status as its output.
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# Normalise every tweet into a lower-case, lemmatized, space-separated string.
# The cleaned strings are collected in `text_list`, in the same order as data.text.

# The lemmatizer does not depend on the tweet, so it is built once here
# instead of being re-created on every loop iteration.
lemma = nltk.WordNetLemmatizer()

text_list = []
for text in data.text:
    # Replace every character that is not an ASCII letter (digits, punctuation,
    # accented characters, ...) with a space, then fold to lower case so that
    # "A" and "a" count as the same token.
    text = re.sub("[^a-zA-Z]", " ", text).lower()
    # Split the cleaned string into a list of word tokens.
    words = nltk.word_tokenize(text)
    # Lemmatize each token (lemmatize() is called with its default arguments)
    # and stitch the tokens back together into one string.
    text = " ".join(lemma.lemmatize(word) for word in words)
    text_list.append(text)

In [7]:
# Bag-of-words features: one row per cleaned tweet, one column per vocabulary term.

max_features = 10000               # Vocabulary is capped at the 10000 most frequent terms

# stop_words="english" removes scikit-learn's built-in English stop words
# (irrelevant words such as "of", "and", "the") before counting.
cv = CountVectorizer(max_features=max_features, stop_words = "english")

# fit_transform returns a sparse count matrix; .toarray() converts it into a
# dense NumPy array with one column per kept term.
sparce_matrix = cv.fit_transform(text_list).toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement and fall back to the old method on older installs.
# .tolist() turns the returned NumPy array into a plain list of str so the
# printed output keeps the same list format as before.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out().tolist()
else:
    feature_names = cv.get_feature_names()
print("top used {} words: {}".format(max_features, feature_names))



In [8]:
# Features are the bag-of-words matrix; the target is the first column of `data`.
x = sparce_matrix
y = data.iloc[:, 0].to_numpy()

# Hold out 8% of the rows for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.08,
    random_state=40,
)

In [9]:
# Fit a logistic-regression classifier on the training split. max_iter is raised
# to 2000 so the solver is allowed more iterations than its default.
lr = LogisticRegression(max_iter=2000).fit(x_train, y_train)

# Predicted labels for the held-out rows.
y_pred = lr.predict(x_test)

In [10]:
# Share of held-out rows whose predicted label matches the true label, as a percentage.
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred) * 100.0
print("Accuracy: ", accuracy)

Accuracy:  70.44458359423919
