### Gender Classification

In [82]:
# Import required libraries

import pandas as pd # process data
from nltk.corpus import stopwords # stop words from common NLP library
import regex as re # regular expression

#### Exploratory Data Analysis

In [26]:
# Load the CSV file into a pandas dataframe (file is decoded as latin1)

df = pd.read_csv(
    'gender-classifier.csv',
    encoding='latin1',
)

In [27]:
# Take a look at the dataframe

df.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,gender,gender:confidence,profile_yn,profile_yn:confidence,created,...,profileimage,retweet_count,sidebar_color,text,tweet_coord,tweet_count,tweet_created,tweet_id,tweet_location,user_timezone
0,815719226,False,finalized,3,10/26/15 23:24,male,1.0,yes,1.0,12/5/13 1:48,...,https://pbs.twimg.com/profile_images/414342229...,0,FFFFFF,Robbie E Responds To Critics After Win Against...,,110964,10/26/15 12:40,6.5873e+17,main; @Kan1shk3,Chennai
1,815719227,False,finalized,3,10/26/15 23:30,male,1.0,yes,1.0,10/1/12 13:51,...,https://pbs.twimg.com/profile_images/539604221...,0,C0DEED,ÛÏIt felt like they were my friends and I was...,,7471,10/26/15 12:40,6.5873e+17,,Eastern Time (US & Canada)
2,815719228,False,finalized,3,10/26/15 23:33,male,0.6625,yes,1.0,11/28/14 11:30,...,https://pbs.twimg.com/profile_images/657330418...,1,C0DEED,i absolutely adore when louis starts the songs...,,5617,10/26/15 12:40,6.5873e+17,clcncl,Belgrade
3,815719229,False,finalized,3,10/26/15 23:10,male,1.0,yes,1.0,6/11/09 22:39,...,https://pbs.twimg.com/profile_images/259703936...,0,C0DEED,Hi @JordanSpieth - Looking at the url - do you...,,1693,10/26/15 12:40,6.5873e+17,"Palo Alto, CA",Pacific Time (US & Canada)
4,815719230,False,finalized,3,10/27/15 1:15,female,1.0,yes,1.0,4/16/14 13:23,...,https://pbs.twimg.com/profile_images/564094871...,0,0,Watching Neighbours on Sky+ catching up with t...,,31462,10/26/15 12:40,6.5873e+17,,


In [28]:
# Show the column labels of the raw dataframe
df.columns

Index(['_unit_id', '_golden', '_unit_state', '_trusted_judgments',
       '_last_judgment_at', 'gender', 'gender:confidence', 'profile_yn',
       'profile_yn:confidence', 'created', 'description', 'fav_number',
       'gender_gold', 'link_color', 'name', 'profile_yn_gold', 'profileimage',
       'retweet_count', 'sidebar_color', 'text', 'tweet_coord', 'tweet_count',
       'tweet_created', 'tweet_id', 'tweet_location', 'user_timezone'],
      dtype='object')

In [29]:
# Summarise each column's non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20050 entries, 0 to 20049
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   _unit_id               20050 non-null  int64  
 1   _golden                20050 non-null  bool   
 2   _unit_state            20050 non-null  object 
 3   _trusted_judgments     20050 non-null  int64  
 4   _last_judgment_at      20000 non-null  object 
 5   gender                 19953 non-null  object 
 6   gender:confidence      20024 non-null  float64
 7   profile_yn             20050 non-null  object 
 8   profile_yn:confidence  20050 non-null  float64
 9   created                20050 non-null  object 
 10  description            16306 non-null  object 
 11  fav_number             20050 non-null  int64  
 12  gender_gold            50 non-null     object 
 13  link_color             20050 non-null  object 
 14  name                   20050 non-null  object 
 15  pr

In [30]:
# Function for total number of records

def shape(df):
    """Return a sentence stating how many rows and columns ``df`` has.

    Parameters
    ----------
    df : object exposing a ``.shape`` attribute with at least two entries
        (for example a pandas DataFrame).

    Returns
    -------
    str
        "There are <rows> rows and <columns> columns in this dataset."
    """
    dims = df.shape
    n_rows, n_cols = dims[0], dims[1]
    template = "There are {} rows and {} columns in this dataset."
    return template.format(n_rows, n_cols)

In [31]:
# Report the row and column count of the raw dataframe
shape(df)

'There are 20050 rows and 26 columns in this dataset.'

In [32]:
# Check for NaN values

df.isnull().sum()

_unit_id                     0
_golden                      0
_unit_state                  0
_trusted_judgments           0
_last_judgment_at           50
gender                      97
gender:confidence           26
profile_yn                   0
profile_yn:confidence        0
created                      0
description               3744
fav_number                   0
gender_gold              20000
link_color                   0
name                         0
profile_yn_gold          20000
profileimage                 0
retweet_count                0
sidebar_color                0
text                         0
tweet_coord              19891
tweet_count                  0
tweet_created                0
tweet_id                     0
tweet_location            7484
user_timezone             7798
dtype: int64

In [33]:
# The gender label is essential for this project, so rows where it is
# missing are removed; the remaining amount of data is still sufficient.

df = df.dropna(subset=['gender'])

In [34]:
# Re-check the per-column NaN counts after removing rows with a missing gender
df.isnull().sum()

_unit_id                     0
_golden                      0
_unit_state                  0
_trusted_judgments           0
_last_judgment_at           50
gender                       0
gender:confidence            0
profile_yn                   0
profile_yn:confidence        0
created                      0
description               3729
fav_number                   0
gender_gold              19903
link_color                   0
name                         0
profile_yn_gold          19903
profileimage                 0
retweet_count                0
sidebar_color                0
text                         0
tweet_coord              19794
tweet_count                  0
tweet_created                0
tweet_id                     0
tweet_location            7442
user_timezone             7768
dtype: int64

In [35]:
# The focus will be on the following variables:
# gender - male, female, brand or unknown
# description - twitter profile description
# text - the text from a tweet
#
# .copy() gives df1 its own data instead of a selection tied to df, so the
# later cells that modify df1 (dropping rows, adding columns) act on df1
# itself and do not raise pandas' SettingWithCopyWarning.

df1 = df[['gender', 'description', 'text']].copy()

In [36]:
# Preview the first rows of the three selected columns
df1.head()

Unnamed: 0,gender,description,text
0,male,i sing my own rhythm.,Robbie E Responds To Critics After Win Against...
1,male,I'm the author of novels filled with family dr...,ÛÏIt felt like they were my friends and I was...
2,male,louis whining and squealing and all,i absolutely adore when louis starts the songs...
3,male,"Mobile guy. 49ers, Shazam, Google, Kleiner Pe...",Hi @JordanSpieth - Looking at the url - do you...
4,female,Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T...,Watching Neighbours on Sky+ catching up with t...


In [37]:
# Count NaN values per column in df1
df1.isnull().sum()

gender            0
description    3729
text              0
dtype: int64

In [38]:
# Report the row and column count of df1
shape(df1)

'There are 19953 rows and 3 columns in this dataset.'

In [60]:
# List the distinct labels found in the gender column
df1['gender'].unique()

array(['male', 'female', 'brand', 'unknown'], dtype=object)

In [63]:
# Count the non-null description and text values for each gender label
df1.groupby(['gender']).count()

In [75]:
# Keep every row whose gender label is not 'brand'.
# The explicit .copy() gives df1 its own data, so columns added to it in
# later cells do not raise pandas' SettingWithCopyWarning.
df1 = df1[df1['gender'] != 'brand'].copy()

In [76]:
# Repeat the per-gender counts now that the 'brand' rows have been removed
df1.groupby(['gender']).count()

Unnamed: 0_level_0,description,text
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
female,5725,6700
male,5469,6194
unknown,702,1117


#### Cleaning the Tweets and Profile Description

In [77]:
def cleaning(s):
    """Normalise a raw tweet or profile description into plain lowercase words.

    Steps, in order:
      1. Missing values (``None`` or a float NaN) become an empty string
         instead of the literal text ``"none"`` / ``"nan"``.
      2. The text is lowercased.
      3. Whole URLs (``http://...`` / ``https://...``) are removed before any
         punctuation is stripped, so no link fragments reach the output.
      4. Every non-word character is replaced by a space.
      5. Digits and underscores are deleted (no space is inserted).
      6. Runs of whitespace are collapsed to a single space.

    Parameters
    ----------
    s : object
        Value to clean; anything that is not already a string is converted
        with ``str()``.

    Returns
    -------
    str
        The cleaned text. A single leading or trailing space may remain;
        the result is not stripped.
    """
    # NaN is the only float that does not compare equal to itself.
    if s is None or (isinstance(s, float) and s != s):
        return ""

    s = str(s).lower()

    # Remove links as a unit. Deleting the substrings "co" and "https" after
    # punctuation removal would also mangle ordinary words
    # (e.g. "coffee" -> "ffee", "company" -> "mpany").
    s = re.sub(r'https?://\S+', ' ', s)

    # Raw strings keep the regex escapes (\W, \d, \s) from being treated as
    # invalid Python string escapes.
    s = re.sub(r'\W', ' ', s)
    s = re.sub(r'[\d_]+', '', s)
    s = re.sub(r'\s+', ' ', s)
    return s

In [83]:
# Run the cleaning function over the raw tweet text and the profile
# description, storing the results in two new columns

df1['Tweets'] = df1['text'].map(cleaning)
df1['Description'] = df1['description'].map(cleaning)

In [84]:
# Build the English stop-word set, then lowercase each tweet, split it into
# a list of words and keep only the words that are not stop words

stop = set(stopwords.words('english'))
df1['Tweets'] = (
    df1['Tweets']
    .str.lower()
    .str.split()
    .apply(lambda words: [word for word in words if word not in stop])
)

In [85]:
# Check the results

df1.head()

Unnamed: 0,gender,description,text,Tweets,Description
0,male,i sing my own rhythm.,Robbie E Responds To Critics After Win Against...,"[robbie, e, responds, critics, win, eddie, edw...",i sing my own rhythm
1,male,I'm the author of novels filled with family dr...,ÛÏIt felt like they were my friends and I was...,"[ûïit, felt, like, friends, living, story, û, ...",i m the author of novels filled with family dr...
2,male,louis whining and squealing and all,i absolutely adore when louis starts the songs...,"[absolutely, adore, louis, starts, songs, hits...",louis whining and squealing and all
3,male,"Mobile guy. 49ers, Shazam, Google, Kleiner Pe...",Hi @JordanSpieth - Looking at the url - do you...,"[hi, jordanspieth, looking, url, use, ifttt, t...",mobile guy ers shazam google kleiner perkins y...
4,female,Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T...,Watching Neighbours on Sky+ catching up with t...,"[watching, neighbours, sky, catching, neighbs,...",ricky wilson the best frontman kaiser chiefs t...
