# Assignment 2 / Author Profiling -- Gender and Age

In [1]:
import nltk
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

## Daten einlesen

In [2]:
messages = pd.read_csv('Messages_train.csv', sep=',')

In [3]:
messages.head()

Unnamed: 0,id,text,person_gender,person_age
0,117808,The particular DKNY bags are among the leading...,male,20s
1,5634,Dental implants are posts made of titanium tha...,female,30s
2,94820,"<p style=""text-align:center;]<a href=""http://e...",female,20s
3,64510,The primary business I joined was the Gospel N...,male,20s
4,89288,Taking QR Codes to Another Level<br />QR Codes...,female,30s


In [4]:
print('Gender :', messages['person_gender'][0] , '  Age class :',messages['person_age'][0] )
print()
print( messages['text'][0][:500])

Gender : male   Age class : 20s

The particular DKNY bags are among the leading choices of many of the stars each local and also international, politicians, celebrities, tycoons, as well as the designs. These items feature an exceptional sort of style which can be nevertheless way too hard to face up to. As soon as a single units the girl eyes about the most recent design, she'll definitely desire owning that. This kind of brand is really reputable and renowned that over the years, the merchandise are becoming much sought after


## Daten bereinigen

In [5]:
def cleanse_text(text):
    porter = nltk.PorterStemmer()
    detokenizer  = nltk.tokenize.treebank.TreebankWordDetokenizer()
    stop_words = nltk.corpus.stopwords.words('english')
    sentences_cleansed = []

    # remove speacial characters
    # to lowercase
    # stem with porter
    # remove stopwords
    bag_of_words = []
    progress = 0
    print("(.) means 1'000 records processed")
    for t in text:    
        sent_cleansed = [porter.stem(word.lower()) 
                         for word in nltk.word_tokenize(t)
                         if word.isalpha() 
                         and word.lower() not in stop_words
                        ]
        sentences_cleansed.append(sent_cleansed)
        bag_of_words.extend(sent_cleansed)
        if(progress % 1000 == 0): 
            print('.', end='')
        progress += 1
    print()
    print("Total words available: ", len(bag_of_words))
    
    
    # find most common words to remove
    fdist = nltk.FreqDist(bag_of_words)
    print("Different words: ", len(fdist))
    most_common = fdist.most_common(20)

    # select most common words to remove
    words_to_remove = []
    for word in most_common:     
        words_to_remove.append(word[0])
    print("Additional words to remove: ", words_to_remove)

    # remove additional most frequent words
    sentences_more_stopwords_removed = []
    total_words_count = 0
    progress = 0
    for t in sentences_cleansed:
        sent_cleansed = [word for word in t
                        if word not in words_to_remove]
        sentences_more_stopwords_removed.append(detokenizer.detokenize(sent_cleansed))
        total_words_count = total_words_count + len(sent_cleansed)
        if(progress % 1000 == 0): 
            print('.', end='')
        progress += 1
    print()
    print("Total words remaining: ", total_words_count)
    return sentences_more_stopwords_removed

In [6]:
cleansed_text = cleanse_text(messages['text']) # this takes some time
print(cleansed_text[0][:80])

gender = messages['person_gender'].values.tolist()
age = messages['person_age'].values.tolist()
# Combine age and gender for combined classification, total 6 categories
gender_age = [ '-'.join((g, a)) for g, a in zip(gender, age)]
print(gender[0], age[0], gender_age[0],cleansed_text[0][:80] )

# shuffle and split data into train and test set
train_x_gender, test_x_gender, train_y_gender, test_y_gender = train_test_split(cleansed_text, gender, test_size=0.2)
train_x_age,    test_x_age,    train_y_age,    test_y_age    = train_test_split(cleansed_text, age,    test_size=0.2)
train_x_genage, test_x_genage, train_y_genage, test_y_genage = train_test_split(cleansed_text, gender_age, test_size=0.2)

print(train_y_gender[0]," ::: ", train_x_gender[0][:100])
print(test_y_gender[0], " ::: ", test_x_gender[0][:100])
print(train_y_age[0],   " ::: ", train_x_age[0][:100])
print(test_y_age[0],    " ::: ", test_x_age[0][:100])
print(test_y_genage[0], " ::: ", test_x_genage[0][:100])

print(len(cleansed_text), len(train_x_gender), len(test_x_gender))


(.) means 1'000 records processed
....................................................................................................
Total words available:  24722618
Different words:  147227
Additional words to remove:  ['br', 'http', 'get', 'use', 'make', 'also', 'one', 'time', 'like', 'need', 'nofollow', 'may', 'well', 'go', 'even', 'peopl', 'way', 'want', 'take', 'mani']
....................................................................................................
Total words remaining:  21134272
particular dkni bag among lead choic star local intern politician celebr tycoon 
male 20s male-20s particular dkni bag among lead choic star local intern politician celebr tycoon 
male  :::  rais listen men women blame misfortun reduc often fatal disord escap realli true low lead lot lousi 
female  :::  hi everi
30s  :::  img vp manag left us give lot magnitud cost come choos servic product actual circumst divers subject
10s  :::  effect plan got mean littl without client crucial sy

## Search best fitting classifier

 Sources used:
 
 https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
 
 https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

In [7]:
# helper methods to find best fitting classifier
def fit_classifier(classifier, X_train, Y_train):
    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', classifier),
    ])
    text_clf.fit(X_train, Y_train)
    return text_clf


def classify(classifier, x_test):
    return classifier.predict(x_test)

In [8]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB
from sklearn.linear_model import Perceptron, PassiveAggressiveClassifier
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import SGDClassifier

def run_classifier_test(clf, train_x, test_x, train_y, test_y):
        fitted_classifier = fit_classifier(clf, train_x, train_y)
        test_y_predicted = classify(fitted_classifier,test_x)
        acc = accuracy_score(test_y, test_y_predicted)
        print("pred: ", test_y_predicted[0], "true: " ,test_y[0], test_x[0][:80])
        print("gender accuracy: ", acc)
        print(nltk.ConfusionMatrix(test_y_predicted.tolist(),test_y))

def find_best_fitting_classifier():
    # tested mostly with 5000 - 100'000 messages, 
    # many more parameters tested than showed below, also more classifiers
    classifiers = [ 
        # KNeighborsClassifier(5),                 # gender: 0.553,  age: 0.479, genage: 0.291
        # SVC(kernel="linear", C=1),               # ok, but too slow by > 10'000
        # SVC(kernel="linear", C=0.1),             # ok, but too slow by > 10'000 
        # LinearSVC(C=1),                          # gender: 0.577,  age: 0.565, genage: 0.354

        #####################################################################################
        # Best gender-age : 0.36915
        #####################################################################################
        LinearSVC(C=0.2),                        # gender: 0.581,  age: 0.578, genage: 0.369
        # LinearSVC(C=0.1),                        # gender: 0.580,  age: 0.581, genage: 0.368

        #####################################################################################
        # Best Age 0.58265
        #####################################################################################
        LinearSVC(C=0.05),                        # gender: 0.580,  age: 0.582, genage: 0.367
        # LinearSVC(C=0.01),                       # gender: 0.572,  age: 0.572, genage: 0.362
        # DecisionTreeClassifier(max_depth=5),     # gender: 0.553,  age: 0.551, genage: 0.334
        # RandomForestClassifier(max_depth=5,      # gender: 0.517,  age: 0.533, genage: 0.300
        #                     n_estimators=50, 
        #                     max_features=2), 

        # MLPClassifier(alpha=1),                  # too slow by 100'000
        # MultinomialNB(alpha=2),                  # gender: 0.576,  age: 0.553, genage: 0.323
        # MultinomialNB(alpha=1),                  # gender: 0.577,  age: 0.561, genage: 0.331
        # MultinomialNB(alpha=0.5),                # gender: 0.579,  age: 0.566, genage: 0.342

        ##################################################################################### 
        # Best gender 0.582
        #####################################################################################
        MultinomialNB(alpha=0.1),                # gender: 0.582,  age: 0.574, genage: 0.355 
        # MultinomialNB(alpha=0.05),                # gender: 0.581,  age: 0.577, genage: 0.357 
        #MultinomialNB(alpha=0.01),                # gender: 0.581,  age: 0.578, genage: 0.359
        # ComplementNB(),                          # gender: 0.572,  age: 0.570, genage: 0.354
        # Perceptron(tol=1e-3),                    # gender: 0.548,  age: 0.513, genage: 0.293
        # PassiveAggressiveClassifier(tol=1e-3),   # gender: 0.560,  age: 0.504, genage: 0.312

        #AdaBoostClassifier(),                   # too slow

        # ExtraTreesClassifier(n_estimators=100),  # gender: ,  age: , genage: 

        # SGDClassifier(loss='squared_hinge',      # gender: 0.539,  age: 0.539, genage: 0.307
        #             penalty='l1', alpha=0.0008, random_state=42,tol=1e-3),
        #SGDClassifier(loss='squared_hinge',      # gender: 0.563,  age: 0.564, genage: 0.354
        #              penalty='elasticnet',alpha=0.0008,random_state=42, tol=1e-3),

    ]
      
    for classifier in classifiers:
        print(classifier)
        
        run_classifier_test(classifier, train_x_gender, test_x_gender, train_y_gender, test_y_gender)
        run_classifier_test(classifier, train_x_age,    test_x_age,    train_y_age,    test_y_age)
        run_classifier_test(classifier, train_x_genage, test_x_genage, train_y_genage, test_y_genage)


In [9]:
# not nececcary to run this method, only used to find best classifier
find_best_fitting_classifier()

LinearSVC(C=0.2, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
pred:  male true:  female hi everi
gender accuracy:  0.5785
       |    f      |
       |    e      |
       |    m    m |
       |    a    a |
       |    l    l |
       |    e    e |
-------+-----------+
female |<5223>4053 |
  male | 4377<6347>|
-------+-----------+
(row = reference; col = test)

pred:  30s true:  10s effect plan got mean littl without client crucial system acquir bring client sma
gender accuracy:  0.5867
    |    1    2    3 |
    |    0    0    0 |
    |    s    s    s |
----+----------------+
10s | <172>  57  119 |
20s |  483<2211>1391 |
30s | 2536 3680<9351>|
----+----------------+
(row = reference; col = test)

pred:  male-30s true:  male-30s becom part mlm
gender accuracy:  0.36555
           |    f    f    f                |
           |    e    e 

Aufgrund der Testresultate wurden folgende Classifier und Parameter gewählt:

### Gender (2 Kategorien): 
MultinomialNB(alpha=0.1)

~58% auf Testset, leichte Schwankungen aufgrund shuffle bei Aufteilung Train-Test Set.

### Age  (3 Kategorien): 
LinearSVC(C=0.05)

~58% auf Testset, leichte Schwankungen aufgrund shuffle bei Aufteilung Train-Test Set.

### Gender-Age kombiniert (6 Kategorien): 
LinearSVC(C=0.2)

~37% auf Testset, leichte Schwankungen aufgrund shuffle bei Aufteilung Train-Test Set.

## Classifiers

In [10]:
def classify_age(text): 
    classifier = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', LinearSVC(C=0.05)),
    ])
    fitted_classifier = classifier.fit(train_x_age, train_y_age)
    return fitted_classifier.predict(text)

def classify_gender(text):
    classifier = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB(alpha=0.1)),
    ])
    fitted_classifier = classifier.fit(train_x_gender, train_y_gender)
    return fitted_classifier.predict(text)

def classify_gender_age(text):
    classifier = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', LinearSVC(C=0.2)),
    ])
    fitted_classifier = classifier.fit(train_x_genage, train_y_genage)
    return fitted_classifier.predict(text)

In [11]:
def print_statistics(class_name, true_values, predictions, classes):
    print(class_name, " accuracy: ", accuracy_score(true_values, predictions))
    print(nltk.ConfusionMatrix(true_values,predictions))
    print(metrics.classification_report(true_values, predictions, classes))   

## Classifiers testen

### Resultate mit Testset (= 20% der Daten)

In [12]:
# Test age
test_y_age_predicted = classify_age(test_x_age)
print_statistics("Age", test_y_age, test_y_age_predicted.tolist(), ['10s','20s', '30s'])

# Test gender
test_y_gender_predicted = classify_gender(test_x_gender)
print_statistics("Gender", test_y_gender, test_y_gender_predicted.tolist(), ['female', 'male'])

# Test gender-age
test_y_genage_predicted = classify_gender_age(test_x_genage)
print_statistics("Gender-Age", test_y_genage, test_y_genage_predicted.tolist(), 
                 ['female-10s','female-20s', 'female-30s','male-10s','male-20s', 'male-30s'])

Age  accuracy:  0.5876
    |    1    2    3 |
    |    0    0    0 |
    |    s    s    s |
----+----------------+
10s |  <34> 409 2748 |
20s |    5<1938>4005 |
30s |   13 1068<9780>|
----+----------------+
(row = reference; col = test)

              precision    recall  f1-score   support

         10s       0.65      0.01      0.02      3191
         20s       0.57      0.33      0.41      5948
         30s       0.59      0.90      0.71     10861

   micro avg       0.59      0.59      0.59     20000
   macro avg       0.60      0.41      0.38     20000
weighted avg       0.59      0.59      0.51     20000

Gender  accuracy:  0.57855
       |    f      |
       |    e      |
       |    m    m |
       |    a    a |
       |    l    l |
       |    e    e |
-------+-----------+
female |<5128>4472 |
  male | 3957<6443>|
-------+-----------+
(row = reference; col = test)

              precision    recall  f1-score   support

      female       0.56      0.53      0.55      9600
    

### Resultate mit Testset Dozent

#### - Daten einlesen und bereinigen um Trainingsformat einzuhalten (ohne stopwords, stemming ...)
#### - Kategorie Gender und Age zusammenlegen für gemeinsame Klassifiezierung

In [13]:
test_messages = pd.read_csv('dummy_test.csv', sep=',')
# Test data has to be cleansed to match trained data (stemming, stopwords..)
cleansed_text_professor = cleanse_text(test_messages['text']) # this takes some time

# combine gender and age categories, total 6 categories
genage_prof = [ '-'.join((g, a)) for g, a in zip(test_messages['person_gender'], test_messages['person_age'])]
print(test_messages['person_gender'][0], test_messages['person_age'][0], genage_prof[0],cleansed_text_professor[0][:80] )


(.) means 1'000 records processed
.
Total words available:  6695
Different words:  1969
Additional words to remove:  ['br', 'http', 'also', 'get', 'use', 'make', 'time', 'drill', 'stock', 'one', 'floor', 'way', 'market', 'like', 'would', 'day', 'boat', 'could', 'go', 'even']
.
Total words remaining:  5748
female 10s female-10s determin purchas sever method come goal work resid trick although away thing red


### Test ausführen und Statistik drucken

In [14]:
test_ages = classify_age(cleansed_text_professor)
print_statistics("Age", test_messages['person_age'].tolist(), test_ages.tolist(), ['10s','20s', '30s'])

test_genders = classify_gender(cleansed_text_professor)
print_statistics("Gender", test_messages['person_gender'].tolist(), test_ages.tolist(), ['female', 'male'])

test_genage = classify_gender_age(cleansed_text_professor)
print_statistics("Gender-Age", genage_prof, test_genage.tolist(), 
                ['female-10s','female-20s', 'female-30s','male-10s','male-20s', 'male-30s'])

Age  accuracy:  0.26666666666666666
    |  1  2  3 |
    |  0  0  0 |
    |  s  s  s |
----+----------+
10s | <.> 3  7 |
20s |  . <.>10 |
30s |  .  2 <8>|
----+----------+
(row = reference; col = test)

              precision    recall  f1-score   support

         10s       0.00      0.00      0.00        10
         20s       0.00      0.00      0.00        10
         30s       0.32      0.80      0.46        10

   micro avg       0.27      0.27      0.27        30
   macro avg       0.11      0.27      0.15        30
weighted avg       0.11      0.27      0.15        30



  'precision', 'predicted', average, warn_for)


Gender  accuracy:  0.0
       |        f    |
       |        e    |
       |        m  m |
       |  2  3  a  a |
       |  0  0  l  l |
       |  s  s  e  e |
-------+-------------+
   20s | <.> .  .  . |
   30s |  . <.> .  . |
female |  3 12 <.> . |
  male |  2 13  . <.>|
-------+-------------+
(row = reference; col = test)

              precision    recall  f1-score   support

      female       0.00      0.00      0.00        15
        male       0.00      0.00      0.00        15

   micro avg       0.00      0.00      0.00        30
   macro avg       0.00      0.00      0.00        30
weighted avg       0.00      0.00      0.00        30



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Gender-Age  accuracy:  0.13333333333333333
           | f f f       |
           | e e e       |
           | m m m m m m |
           | a a a a a a |
           | l l l l l l |
           | e e e e e e |
           | - - - - - - |
           | 1 2 3 1 2 3 |
           | 0 0 0 0 0 0 |
           | s s s s s s |
-----------+-------------+
female-10s |<.>. 2 . 2 1 |
female-20s | .<.>2 . . 3 |
female-30s | . .<3>. 1 1 |
  male-10s | . . 3<.>1 1 |
  male-20s | . . 4 .<.>1 |
  male-30s | . . 3 . 1<1>|
-----------+-------------+
(row = reference; col = test)

              precision    recall  f1-score   support

  female-10s       0.00      0.00      0.00         5
  female-20s       0.00      0.00      0.00         5
  female-30s       0.18      0.60      0.27         5
    male-10s       0.00      0.00      0.00         5
    male-20s       0.00      0.00      0.00         5
    male-30s       0.12      0.20      0.15         5

   micro avg       0.13      0.13      0.13        30
   mac

## Nützliche Ressourcen

https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html