**Task**
1. Build and test a text classifier based on the age of a user according their age classes (0-10, 11-20, 21-30, 31+).

2. Build a ML name classifier that classifies the age of a user according their age classes (0-10, 11-20, 21-30, 31+).

3. Build a meta classifier that combines the previously built classifiers based on their age classes (0-10, 11-20, 21-30, 31+).

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB as bayes
from sklearn.feature_extraction.text import CountVectorizer as countvec
from sklearn.metrics import accuracy_score as accuracy
from sklearn.ensemble import RandomForestClassifier 
import random
random.seed(0)

In [2]:
# Import data and show some information
data = pd.read_pickle('./twitterData.pkl')
print(data.shape)
data.head(10)

(2916, 6)


Unnamed: 0,screen_name,name,tweets_concatenated,avatar_url,gender,age
0,DatZerooo,David,Warum riecht mein Bruder nach Pizza wenn er ei...,http://pbs.twimg.com/profile_images/7569661512...,M,16.0
0,reap705,Oliver Gast,[CSS] Ein Off-canvas-Menü mit Dropdown-Navigat...,http://pbs.twimg.com/profile_images/1366984169...,M,15.0
0,Dossary1995,Khalid Aldossary,,http://pbs.twimg.com/profile_images/4167523242...,M,37.0
0,eduUu06,eduUu,heut abend kogge und morgen endlich haare ab :...,http://pbs.twimg.com/profile_images/896480580/...,M,46.0
0,Narutofreak935,Avengar,@GrandlineTV gib nicht auf und mach dein Ding ...,http://pbs.twimg.com/profile_images/7317971734...,F,19.0
0,miley_sarah,Sarah,"RT @bomelino: Das ""Backe, backe Kuchen""-Lied i...",http://pbs.twimg.com/profile_images/7584397626...,M,18.0
0,DerIncubus,Der Incubus,@NicoleAllm Na ... gut ins neue Jahr gestartet...,http://pbs.twimg.com/profile_images/5808649200...,F,22.0
0,Petouser,ペトユサ (Petoyusa),Verschwörungstheorie: Pokemon Go wird von der ...,http://pbs.twimg.com/profile_images/6626943925...,M,26.0
0,ChrisWhite126,Chris White,Ach du scheiße ist das warm. :( Hab locker 5kg...,http://pbs.twimg.com/profile_images/6463341220...,M,37.0
0,MusicFreakFever,Nadine (:,Heute nur DREI (!) Schulstunden !\n Ich LIIIIE...,http://pbs.twimg.com/profile_images/3788000003...,M,15.0


In [3]:
# Remove instances without tweets information
data[data['tweets_concatenated'] == ''].shape

(430, 6)

In [4]:
data = data[data['tweets_concatenated'] != '']
data.shape

(2486, 6)

In [5]:
# Remove instances without age information
data.isnull().sum()

screen_name               0
name                      0
tweets_concatenated       0
avatar_url                0
gender                    0
age                    1349
dtype: int64

In [6]:
data.dropna(subset=['age'], inplace=True)
data

Unnamed: 0,screen_name,name,tweets_concatenated,avatar_url,gender,age
0,DatZerooo,David,Warum riecht mein Bruder nach Pizza wenn er ei...,http://pbs.twimg.com/profile_images/7569661512...,M,16.0
0,reap705,Oliver Gast,[CSS] Ein Off-canvas-Menü mit Dropdown-Navigat...,http://pbs.twimg.com/profile_images/1366984169...,M,15.0
0,eduUu06,eduUu,heut abend kogge und morgen endlich haare ab :...,http://pbs.twimg.com/profile_images/896480580/...,M,46.0
0,Narutofreak935,Avengar,@GrandlineTV gib nicht auf und mach dein Ding ...,http://pbs.twimg.com/profile_images/7317971734...,F,19.0
0,miley_sarah,Sarah,"RT @bomelino: Das ""Backe, backe Kuchen""-Lied i...",http://pbs.twimg.com/profile_images/7584397626...,M,18.0
...,...,...,...,...,...,...
0,tinamuellerluge,Tina Müller,RT @sportDland: Da hört ihr's: Gebt alles! 💪\n...,http://pbs.twimg.com/profile_images/6793113826...,M,16.0
0,charmingLiisa,Liisa,"@Frau_Dokta &lt;3||""Das Jahrhundert des Le Cor...",http://pbs.twimg.com/profile_images/6470215469...,F,51.0
0,MajaSpecht,Maja Specht,😍😍 http://t.co/c6lb259T67||,http://pbs.twimg.com/profile_images/6003719710...,F,18.0
0,GabiHillerOhm,Gabriele Hiller-Ohm,Guten Morgen aus der Hansestadt.||Nachtwanderu...,http://pbs.twimg.com/profile_images/6498112476...,F,63.0


In [7]:
# Group users in age classes by replacing: (0-10) = 0, (11-20) = 1, (21-30) = 2, (31+) = 3 
data['age'] = np.where(data['age'].between(0,11), 0, data['age'])
data['age'] = np.where(data['age'].between(11,21), 1, data['age'])
data['age'] = np.where(data['age'].between(21,31), 2, data['age'])
data['age'] = np.where(data['age'].between(31,100), 3, data['age'])
data

Unnamed: 0,screen_name,name,tweets_concatenated,avatar_url,gender,age
0,DatZerooo,David,Warum riecht mein Bruder nach Pizza wenn er ei...,http://pbs.twimg.com/profile_images/7569661512...,M,1.0
0,reap705,Oliver Gast,[CSS] Ein Off-canvas-Menü mit Dropdown-Navigat...,http://pbs.twimg.com/profile_images/1366984169...,M,1.0
0,eduUu06,eduUu,heut abend kogge und morgen endlich haare ab :...,http://pbs.twimg.com/profile_images/896480580/...,M,3.0
0,Narutofreak935,Avengar,@GrandlineTV gib nicht auf und mach dein Ding ...,http://pbs.twimg.com/profile_images/7317971734...,F,1.0
0,miley_sarah,Sarah,"RT @bomelino: Das ""Backe, backe Kuchen""-Lied i...",http://pbs.twimg.com/profile_images/7584397626...,M,1.0
...,...,...,...,...,...,...
0,tinamuellerluge,Tina Müller,RT @sportDland: Da hört ihr's: Gebt alles! 💪\n...,http://pbs.twimg.com/profile_images/6793113826...,M,1.0
0,charmingLiisa,Liisa,"@Frau_Dokta &lt;3||""Das Jahrhundert des Le Cor...",http://pbs.twimg.com/profile_images/6470215469...,F,3.0
0,MajaSpecht,Maja Specht,😍😍 http://t.co/c6lb259T67||,http://pbs.twimg.com/profile_images/6003719710...,F,1.0
0,GabiHillerOhm,Gabriele Hiller-Ohm,Guten Morgen aus der Hansestadt.||Nachtwanderu...,http://pbs.twimg.com/profile_images/6498112476...,F,3.0


In [8]:
data['age'] = data['age'].astype('int')
data

Unnamed: 0,screen_name,name,tweets_concatenated,avatar_url,gender,age
0,DatZerooo,David,Warum riecht mein Bruder nach Pizza wenn er ei...,http://pbs.twimg.com/profile_images/7569661512...,M,1
0,reap705,Oliver Gast,[CSS] Ein Off-canvas-Menü mit Dropdown-Navigat...,http://pbs.twimg.com/profile_images/1366984169...,M,1
0,eduUu06,eduUu,heut abend kogge und morgen endlich haare ab :...,http://pbs.twimg.com/profile_images/896480580/...,M,3
0,Narutofreak935,Avengar,@GrandlineTV gib nicht auf und mach dein Ding ...,http://pbs.twimg.com/profile_images/7317971734...,F,1
0,miley_sarah,Sarah,"RT @bomelino: Das ""Backe, backe Kuchen""-Lied i...",http://pbs.twimg.com/profile_images/7584397626...,M,1
...,...,...,...,...,...,...
0,tinamuellerluge,Tina Müller,RT @sportDland: Da hört ihr's: Gebt alles! 💪\n...,http://pbs.twimg.com/profile_images/6793113826...,M,1
0,charmingLiisa,Liisa,"@Frau_Dokta &lt;3||""Das Jahrhundert des Le Cor...",http://pbs.twimg.com/profile_images/6470215469...,F,3
0,MajaSpecht,Maja Specht,😍😍 http://t.co/c6lb259T67||,http://pbs.twimg.com/profile_images/6003719710...,F,1
0,GabiHillerOhm,Gabriele Hiller-Ohm,Guten Morgen aus der Hansestadt.||Nachtwanderu...,http://pbs.twimg.com/profile_images/6498112476...,F,3


In [9]:
# Split the data into two trainings (trainSub, trainMeta) and one test set (test)
trainSub, tempData = train_test_split(data, test_size=0.4)
trainMeta, test = train_test_split(tempData, test_size=0.4)
print(trainSub.shape, trainMeta.shape, test.shape)

(682, 6) (273, 6) (182, 6)


In [10]:
# Extract tweets and names from the data
trainSub_tweets = trainSub['tweets_concatenated']
trainMeta_tweets = trainMeta['tweets_concatenated']
test_tweets = test['tweets_concatenated']
trainSub_names = trainSub['name']
trainMeta_names = trainMeta['name']
test_names = test['name']

In [11]:
# Extract the target column for each dataset
y_trainSub = trainSub['age']
y_trainMeta = trainMeta['age']
y_test = test['age']

In [12]:
trainSub_tweets.head(10)

0    Seid glücklich mit dem was ihr habt. Und forde...
0    morgen nach polen &lt;3||hat im moment keine w...
0    @DHLPaket Jetzt ja, Karte kam heute nun auch p...
0    @Berndtdasbrot auweia :D||@Berndtdasbrot armer...
0    Ärger über Samsung - das wars! Bis auf weitere...
0    http://www.sat1.de/filme_serien/ncis/video/gan...
0    ICH BIN SCHARF http://t.co/AEdEJDu||FREAK in m...
0    @iBlali ohmein gott du passt dich dem Boden an...
0    Habe nun eine Ps4. Wer mich gerne adden will P...
0    @dagibeee ach dagi ist echt scheiße aber du mu...
Name: tweets_concatenated, dtype: object

In [13]:
y_trainSub.head(10)

0    0
0    1
0    2
0    1
0    3
0    3
0    1
0    1
0    2
0    1
Name: age, dtype: int32

In [14]:
# 1. Build and test a text classifier based on the age of a user according their age classes (0-10, 11-20, 21-30, 31+)

# ML models don't recognize "tweets", so we need to apply a Bag of words in order to vectorize the words in the tweets and apply our model to this new dataset.
# This 'submodel' will provide the first stack of training and testing data for the final meta ML model.

In [15]:
# Use a Bag of Words Approach and train a classification model on trainSub_tweets
countvectorize_tweets = countvec()
X_trainSub_tweets = countvectorize_tweets.fit_transform(trainSub_tweets)
X_trainMeta_tweets = countvectorize_tweets.transform(trainMeta_tweets)
X_test_tweets = countvectorize_tweets.transform(test_tweets)

In [16]:
bayes_tweets = bayes()
bayes_tweets.fit(X_trainSub_tweets, y_trainSub)
tweet_score = bayes_tweets.score(X_test_tweets, y_test)

In [17]:
tweetScore_text = "Tweet SCore is {:0.2%}".format(tweet_score)
print(tweetScore_text)

Tweet SCore is 65.38%


In [18]:
stacked_input1 = pd.Series(bayes_tweets.predict(X_trainMeta_tweets))
stacked_input1_test = pd.Series(bayes_tweets.predict(X_test_tweets))

In [19]:
# 2. Build a ML name classifier that classifies the age of a user according their age classes (0-10, 11-20, 21-30, 31+)

# Similar to the text classifier, we train a model on our names.
# This submodel will provide the second stack of training and testing data for the final meta ML model

In [20]:
cvectorizer_names = countvec()
x_trainSub_names = cvectorizer_names.fit_transform(trainSub_names)

x_trainMeta_names = cvectorizer_names.transform(trainMeta_names)
x_test_names = cvectorizer_names.transform(test_names)

cvectorizer_names.get_feature_names()

['18',
 '19blockbuster97',
 '1pundplays',
 '_rudi_',
 'abrell',
 'achtelik',
 'addi',
 'adrian',
 'aka',
 'alawehayyealawe',
 'alber',
 'alex',
 'alexander',
 'alfath',
 'alfred',
 'aliiiina',
 'alina',
 'alt',
 'alysha',
 'ameri',
 'ami',
 'amir',
 'amy',
 'anastasia',
 'andi',
 'andrea',
 'andreas',
 'andré',
 'angelina',
 'anica',
 'anika',
 'anime',
 'anissa',
 'anja',
 'anke',
 'ann',
 'anna',
 'annastasia',
 'anne',
 'annelise',
 'annika',
 'antonia',
 'antonio',
 'arina',
 'armin',
 'armstrong',
 'arrogant',
 'art',
 'auer',
 'augustina',
 'aull',
 'aus',
 'baerbel',
 'baghouz',
 'bakaï',
 'bambina',
 'barkeeper',
 'baron',
 'bassilicum',
 'bayfidan',
 'bea',
 'becci',
 'bee',
 'behn',
 'believe',
 'ben',
 'benjamin',
 'benny',
 'berger',
 'berkiix3',
 'bernd',
 'berschick',
 'bianca',
 'bibi',
 'birger',
 'blaha',
 'blanco',
 'bloch',
 'blog',
 'bluehair',
 'bluhm',
 'blume',
 'boy',
 'brandon',
 'brauer',
 'braun',
 'brendler',
 'brini',
 'britta',
 'bronut',
 'brown',
 'brunh

In [21]:
bayes_names = bayes()
bayes_names.fit(x_trainSub_names, y_trainSub)

MultinomialNB()

In [22]:
nameScore = bayes_names.score(x_test_names, y_test)
nameScore_text = "Name Score is {:0.2%}".format(nameScore)
print(nameScore_text)

Name Score is 63.19%


In [23]:
stacked_input2 = pd.Series(bayes_names.predict(x_trainMeta_names))
stacked_input2_test = pd.Series(bayes_names.predict(x_test_names))

In [24]:
# 3. Build a meta classifier that combines the previously built classifiers based on their age classes (0-10, 11-20, 21-30, 31+)

# Now we use the stacks as input for our Meta Classifier

In [25]:
# Build a pandas df for training
meta_data_train = {'input1': stacked_input1, 'input2': stacked_input2}
meta_data_train = pd.DataFrame(meta_data_train)
meta_data_train.head(10)

Unnamed: 0,input1,input2
0,1,1
1,1,1
2,1,1
3,1,1
4,1,2
5,1,1
6,1,1
7,1,1
8,1,1
9,1,1


In [26]:
# Build a pandas df for test
meta_data_test = {'input1': stacked_input1_test, 'input2': stacked_input2_test}
meta_data_test = pd.DataFrame(meta_data_test)
meta_data_test.head(10)

Unnamed: 0,input1,input2
0,2,1
1,1,1
2,1,1
3,1,1
4,1,1
5,1,1
6,1,1
7,1,1
8,1,1
9,1,1


In [27]:
# Train the Meta Classifier 
forest = RandomForestClassifier()
forest.fit(meta_data_train, y_trainMeta)

RandomForestClassifier()

In [28]:
# Evaluate the Meta Classifier on the test set and compare to previous results
metaScore = forest.score(meta_data_test, y_test)
metaScore_text = "Meta Score is {:0.2%}".format(metaScore)
print(tweetScore_text)
print(nameScore_text)
print(metaScore_text)

Tweet SCore is 65.38%
Name Score is 63.19%
Meta Score is 67.03%
