In [2]:
import sklearn as sk
import spacy
nlp = spacy.blank("en")
import nltk
import re
import pandas as pd
import numpy as np
from collections import defaultdict as dd
from sklearn.feature_extraction.text import CountVectorizer
from processing_functions import Preprocess as process
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
STOP = stopwords.words('english')
from textblob import TextBlob
from textblob import Word

# Age-band labels. A class index (as returned by categorise_targets) is the
# position of its label in this list; index 4 ("?") is the catch-all band.
CATEGORIES = ["14-16", "24-26", "34-36", "44-46","?"]
# Integer class indices, one per entry of CATEGORIES.
# NOTE(review): not referenced anywhere else in the visible notebook.
CATEGORIES_INDEX = [0,1,2,3,4]

In [3]:
def read(file):
    """Load one headerless CSV from the project data directory.

    Args:
        file: Base name of the CSV file, without the ".csv" extension.

    Returns:
        A DataFrame whose columns are numbered 0..n-1 (no header row is read).
    """
    csv_path = "./COMP30027_2018S1_proj2-data/" + file + ".csv"
    return pd.read_csv(csv_path, header=None)
def apply_categorise_targets(df):
    """Run categorise_targets through ``df.apply`` and return the result.

    Args:
        df: Any object exposing a pandas-style ``apply`` method.

    Returns:
        Whatever ``df.apply(categorise_targets)`` produces.
    """
    return df.apply(categorise_targets)
    
def categorise_targets(num):
    """Map a numeric age to its age-band class index.

    The bands mirror the CATEGORIES labels: 14-16 -> 0, 24-26 -> 1,
    34-36 -> 2, 44-46 -> 3. Any age outside those bands maps to 4 ("?").

    The first band now checks its lower bound as well, so ages below 14 are
    no longer labelled "14-16"; this matches how the other bands are tested.

    Args:
        num: Age as a number.

    Returns:
        int: Class index in the range 0..4.
    """
    # (low, high) inclusive bounds; position in the tuple is the class index.
    age_bands = ((14, 16), (24, 26), (34, 36), (44, 46))
    for class_index, (low, high) in enumerate(age_bands):
        if low <= num <= high:
            return class_index
    # Anything outside the known bands (including NaN) is "unknown".
    return 4
def read_and_output_train_test_split(train, test):
    """Load the train and test CSVs and split them into text and targets.

    From each file, columns 0, 6 and 2 are kept (in that order) and renamed
    to "id", "document" and "age". The "age" column is converted to class
    indices with categorise_targets.

    Args:
        train: Base name (no extension) of the training CSV.
        test: Base name (no extension) of the test CSV.

    Returns:
        A 3-tuple:
        ((train_text, train_target), (test_text, test_target),
         (train_frame, test_frame)),
        where the frames are the renamed three-column DataFrames.
    """
    column_names = ["id", "document", "age"]

    frames = []
    for file_name in (train, test):
        frame = read(file_name).loc[:, [0, 6, 2]]
        frame.columns = column_names
        frames.append(frame)
    train_raw, test_raw = frames

    train_split = (train_raw['document'],
                   train_raw['age'].apply(categorise_targets))
    test_split = (test_raw['document'],
                  test_raw['age'].apply(categorise_targets))

    return (train_split, test_split, (train_raw, test_raw))

def remove_non_alpha(string):
    """Return `string` with every character outside a-z / A-Z deleted."""
    return re.sub(r'[^a-zA-Z]', "", string)

def tokenise(text):
    """Split text on whitespace and keep only the letters of each token.

    Each whitespace-separated chunk is passed through remove_non_alpha;
    chunks that end up empty are dropped.

    Args:
        text: The string to tokenise.

    Returns:
        list of str: Letter-only tokens in their original order.
    """
    tokens = []
    for chunk in text.split():
        letters_only = remove_non_alpha(chunk)
        if letters_only != '':
            tokens.append(letters_only)
    return tokens

def to_str(string_list):
    """Join the given strings into one string separated by single spaces."""
    separator = ' '
    return separator.join(string_list)


def remove_stop_words_to_str(string_list):
    """Drop stop words and the 'urlLink' placeholder, then join with spaces.

    A token counts as a stop word when the module-level spaCy vocabulary
    (`nlp.vocab`) flags it via `is_stop`.

    Args:
        string_list: Iterable of string tokens.

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    kept_tokens = [
        token
        for token in string_list
        # The vocabulary lookup comes first so every token is looked up.
        if not nlp.vocab[token].is_stop and token != 'urlLink'
    ]
    return ' '.join(kept_tokens)




def clean(train_text):
    """Normalise a Series of documents for bag-of-words modelling.

    Steps, in order:
      1. strip every character that is neither a word character nor whitespace;
      2. lower-case;
      3. remove NLTK English stop words;
      4. remove the 10 most frequent remaining words (corpus-wide);
      5. remove the 100 least frequent remaining words (corpus-wide);
      6. lemmatise each word with TextBlob's ``Word.lemmatize``.

    Args:
        train_text: pandas Series of strings.

    Returns:
        A new pandas Series of cleaned, space-joined documents.
    """
    def _drop_words(texts, unwanted):
        # Remove every whitespace-separated word that appears in `unwanted`.
        unwanted = set(unwanted)  # set gives O(1) membership tests per word
        return texts.apply(
            lambda doc: " ".join(word for word in doc.split() if word not in unwanted)
        )

    # regex=True is explicit because newer pandas treats the pattern as a
    # literal string by default, which would leave all punctuation in place.
    train_text = train_text.str.replace(r'[^\w\s]', '', regex=True)
    train_text = train_text.str.lower()

    train_text = _drop_words(train_text, stopwords.words('english'))

    # value_counts() sorts by descending frequency, so the head holds the
    # most common words and the tail the rarest.
    most_common = pd.Series(' '.join(train_text).split()).value_counts().index[:10]
    train_text = _drop_words(train_text, most_common)

    rarest = pd.Series(' '.join(train_text).split()).value_counts().index[-100:]
    train_text = _drop_words(train_text, rarest)

    train_text = train_text.apply(
        lambda doc: " ".join(Word(word).lemmatize() for word in doc.split())
    )

    return train_text





# Split data up

In [4]:
# Load train_raw.csv and dev_raw.csv; `data` is the nested tuple
# ((train_text, train_target), (test_text, test_target), (train_frame, test_frame)).
data = read_and_output_train_test_split("train_raw", "dev_raw")

In [5]:
# Unpack the nested tuple returned by read_and_output_train_test_split.
train_text = data[0][0]
train_target = data[0][1]

# The "test" split here is the development file loaded above.
test_text = data[1][0]
test_target = data[1][1]


# Full three-column frames (id, document, age). The helper functions defined
# later read `test_data` as a global.
training_data = data[2][0]
test_data = data[2][1]

In [6]:
# Clean the training documents in place of the raw ones.
# NOTE(review): this overwrites its own input, so re-running the cell cleans
# already-cleaned text; `test_text` is never passed through clean().
train_text=clean(train_text)

In [38]:
# Display the training documents.
# NOTE(review): the saved output still shows capitals and punctuation, so this
# cell appears to have been run when `train_text` held un-cleaned text.
train_text

0                      Douglas Rushkoff's Frontline docu...
1                      I'm DSLless for 10 days.  I suppo...
2                      Crap.  I ordered the new Citizen ...
3                      I can't get enough of CBC's  urlL...
4                      In case anyone's sitting on the e...
5                       urlLink 'We reinvented culture t...
6                       urlLink There's a great article ...
7                      I'm the top left corner on Cory D...
8                      I've started a film blog:   urlLi...
9                       urlLink Great roadtrip article b...
10                      urlLink Daily Relay  is finally ...
11                     Here's what the Slashdot effect l...
12                      urlLink Home of the Underdogs  i...
13                      urlLink Mikel.org  pointed me to...
14                       This is incredible.  Scientists...
15                     There's a great little Chinese gr...
16                     Who'd have though

In [31]:
def train(train_text, train_target):
    """Fit a TF-IDF + logistic-regression pipeline on the given documents.

    The vectoriser is built with the module-level STOP list as stop words
    and max_df=0.50.

    Args:
        train_text: Iterable of training documents (strings).
        train_target: Class labels aligned with `train_text`.

    Returns:
        The fitted sklearn Pipeline.
    """
    # Building the pipeline: vectorise first, then classify.
    steps = [
        ('vectorizer', TfidfVectorizer(stop_words=STOP, max_df=0.50)),
        ('clf', LogisticRegression()),
    ]
    pipeline = Pipeline(steps)
    pipeline.fit(train_text, train_target)
    return pipeline

# Train the text classifier

In [32]:
# Fit the TF-IDF + logistic-regression pipeline on the training split.
text_clf = train(train_text, train_target)

In [33]:
# Minimum top-class probability for a per-post prediction to be trusted;
# fill_author_id_age_list counts posts below it as class 4 ("?").
# NOTE(review): the largest of k class probabilities is always >= 1/k, and
# categorise_targets yields at most 5 classes (>= 0.2), so 0.13 looks like it
# can never trigger - confirm the intended value.
CONFIDENCE_THRESHOLD = 0.13
def make_kaggle_output(test_data, predicted_dict):
    """Print one "<row id>,<age band>" line per row of `test_data`.

    The row id is the literal prefix "3" followed by the row's index label
    plus one. The age band is the CATEGORIES label of the class predicted
    for that row's author.

    Args:
        test_data: DataFrame with an "id" (author id) column.
        predicted_dict: Mapping from author id to predicted class index.
    """
    for document in test_data.index:
        row_id = "3" + str(document + 1)
        author_id = test_data['id'][document]
        age_band = str(CATEGORIES[predicted_dict[author_id]])
        print(row_id + ',' + age_band)

def create_author_id_age_list():
    """Build an empty vote table keyed by author id.

    Reads the global `test_data` frame.

    Returns:
        dict: One entry per distinct value of test_data['id'], each mapped
        to its own empty dict.
    """
    author_ids = test_data['id']
    return {author_ids[row_label]: {} for row_label in author_ids.index}

def fill_author_id_age_list(clf, test_text, author_id_age_list,
                            author_ids=None, threshold=None):
    """Tally, per author, how many posts were predicted as each class.

    Every document contributes one vote to its author's tally: the
    classifier's predicted class, or class 4 ("?") when the highest class
    probability is below `threshold`. Note that the largest of k class
    probabilities is always at least 1/k, so a threshold below that value
    never triggers.

    The classifier is called once on the whole batch rather than several
    times per document. Documents and author ids are paired by position.

    Args:
        clf: Fitted classifier exposing `predict` and `predict_proba`.
        test_text: Iterable of documents (strings).
        author_id_age_list: dict mapping author id -> {class index: count};
            updated in place. Authors not yet present are added.
        author_ids: Iterable of author ids aligned with `test_text`.
            Defaults to the global `test_data['id']`.
        threshold: Minimum top-class probability to accept a prediction.
            Defaults to the global CONFIDENCE_THRESHOLD.

    Returns:
        dict: The same `author_id_age_list` object, with updated counts.
    """
    unknown_class = 4  # index of "?" in CATEGORIES

    if author_ids is None:
        author_ids = test_data['id']
    if threshold is None:
        threshold = CONFIDENCE_THRESHOLD

    documents = list(test_text)
    if not documents:
        # Nothing to classify; avoid handing an empty batch to the model.
        return author_id_age_list

    # One batched call each instead of up to three model calls per document.
    probabilities = clf.predict_proba(documents)
    predictions = clf.predict(documents)

    for author_id, class_probs, predicted_class in zip(author_ids, probabilities, predictions):
        if max(class_probs) < threshold:
            vote = unknown_class
        else:
            vote = predicted_class
        tally = author_id_age_list.setdefault(author_id, {})
        tally[vote] = tally.get(vote, 0) + 1

    return author_id_age_list



def get_predict_list(author_id_age_list, predict_list):
    """Append each author's most-voted class to `predict_list`.

    Authors are visited in the dict's iteration order. When several classes
    share the top count, the one that comes first in that author's tally
    wins.

    Args:
        author_id_age_list: dict mapping author id -> {class index: count}.
        predict_list: list to append to (mutated in place).

    Returns:
        list: The same `predict_list` object.
    """
    for tally in author_id_age_list.values():
        # Stable descending sort by count; the first entry is the winner.
        ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
        predict_list.append(ranked[0][0])
    return predict_list


def get_predict_dict(author_id_age_list, predict_dict):
    """Record each author's most-voted class in `predict_dict`.

    When several classes share the top count, the one that comes first in
    that author's tally wins.

    Args:
        author_id_age_list: dict mapping author id -> {class index: count}.
        predict_dict: dict to fill (mutated in place).

    Returns:
        dict: The same `predict_dict` object, mapping author id -> class index.
    """
    for author_id, tally in author_id_age_list.items():
        # Stable descending sort by count; the first entry is the winner.
        ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
        winner = ranked[0][0]
        predict_dict[author_id] = winner
    return predict_dict

def test_target_to_unique_dict():
    """Return the true class index of each distinct author in `test_data`.

    Reads the global `test_data` frame. For every author id, the age from
    that author's first row is kept and converted with categorise_targets.

    Returns:
        pandas.Series: Indexed by author id, holding class indices (0..4).
    """
    first_age_by_author = {}
    # Walk the two columns in parallel. setdefault keeps only the first age
    # seen for an author and does an O(1) lookup, instead of rebuilding a
    # list of keys on every row.
    for author_id, age in zip(test_data['id'], test_data['age']):
        first_age_by_author.setdefault(author_id, age)

    return pd.Series(first_age_by_author).apply(categorise_targets)


# Aggregate per-post predictions by unique author id

In [34]:
# predict once per id
author_id_age_list = create_author_id_age_list()
author_id_age_list=fill_author_id_age_list(text_clf, test_text, author_id_age_list)
predicted = []
predicted_dict = {}
predicted = get_predict_list(author_id_age_list, predicted)
predicted_dict = get_predict_dict(author_id_age_list, predicted_dict)

In [35]:
# Inspect the per-author vote tallies: author id -> {class index: number of posts}.
author_id_age_list

{11253: {0: 1, 1: 5, 4: 1},
 46465: {0: 6, 1: 8, 4: 5},
 99382: {0: 5, 1: 49, 4: 19},
 143781: {1: 5, 4: 1},
 172224: {1: 22, 4: 4},
 183164: {0: 1, 1: 9, 4: 2},
 195585: {0: 20, 1: 83, 4: 41},
 216442: {0: 5, 1: 58, 4: 19},
 278367: {1: 2, 4: 2},
 299143: {0: 38, 1: 215, 4: 72},
 307112: {0: 159, 1: 265, 3: 1, 4: 118},
 315751: {1: 28, 2: 4, 4: 18},
 318564: {0: 11, 1: 12, 4: 10},
 420325: {0: 1, 1: 36, 4: 2},
 472101: {0: 11, 1: 31, 4: 16},
 476109: {0: 1, 1: 40, 4: 11},
 483062: {1: 16, 4: 4},
 533593: {0: 1, 1: 16, 4: 1},
 535158: {0: 5, 1: 311, 2: 12, 3: 2, 4: 136},
 546941: {0: 1, 1: 10},
 562322: {0: 3, 1: 29, 2: 2, 4: 10},
 601845: {0: 1, 1: 1, 4: 2},
 620124: {0: 46, 1: 250, 2: 4, 4: 51},
 638526: {0: 6, 1: 1, 4: 5},
 639335: {0: 1, 1: 37, 4: 6},
 656452: {0: 1, 1: 7},
 661265: {1: 46, 2: 1, 4: 17},
 665284: {0: 14, 1: 90, 4: 26},
 729786: {0: 6, 1: 125, 4: 40},
 732730: {0: 33, 1: 11, 4: 9},
 743007: {1: 15, 4: 3},
 748386: {0: 1, 1: 26, 4: 5},
 751202: {0: 31, 1: 119, 4: 81}

# Model Accuracy on Development Set

In [37]:
# Per-author accuracy on the development set.
# Predictions are aligned with the ground truth on author id instead of being
# compared position by position, so the result does not depend on the dict
# and the Series happening to be in the same order.
true_by_author = test_target_to_unique_dict()
predicted_by_author = pd.Series(predicted_dict).reindex(true_by_author.index)
np.mean(predicted_by_author == true_by_author)

0.60855043420173682

In [75]:
# Print one "<row id>,<age band>" line per row of test_data, using each
# author's aggregated prediction.
make_kaggle_output(test_data, predicted_dict)

31,24-26
32,24-26
33,24-26
34,24-26
35,24-26
36,24-26
37,24-26
38,24-26
39,24-26
310,24-26
311,24-26
312,24-26
313,24-26
314,24-26
315,24-26
316,24-26
317,24-26
318,24-26
319,24-26
320,24-26
321,24-26
322,24-26
323,24-26
324,24-26
325,24-26
326,24-26
327,24-26
328,24-26
329,24-26
330,24-26
331,24-26
332,24-26
333,24-26
334,24-26
335,24-26
336,24-26
337,24-26
338,24-26
339,24-26
340,24-26
341,24-26
342,24-26
343,24-26
344,24-26
345,24-26
346,24-26
347,24-26
348,24-26
349,24-26
350,24-26
351,24-26
352,24-26
353,24-26
354,24-26
355,24-26
356,24-26
357,24-26
358,24-26
359,24-26
360,24-26
361,24-26
362,24-26
363,24-26
364,24-26
365,24-26
366,24-26
367,24-26
368,24-26
369,24-26
370,24-26
371,24-26
372,24-26
373,24-26
374,24-26
375,24-26
376,24-26
377,24-26
378,24-26
379,24-26
380,24-26
381,24-26
382,24-26
383,24-26
384,24-26
385,24-26
386,24-26
387,24-26
388,24-26
389,24-26
390,24-26
391,24-26
392,24-26
393,24-26
394,24-26
395,24-26
396,24-26
397,24-26
398,24-26
399,24-26
3100,24-26
3101,24-

In [23]:
# Inspect the training labels (class indices produced by categorise_targets).
train_target

0         1
1         1
2         1
3         1
4         1
5         1
6         1
7         1
8         1
9         1
10        1
11        1
12        1
13        1
14        1
15        1
16        1
17        1
18        1
19        1
20        1
21        1
22        1
23        1
24        1
25        1
26        1
27        1
28        1
29        1
         ..
276385    1
276386    1
276387    1
276388    1
276389    1
276390    1
276391    0
276392    0
276393    1
276394    1
276395    0
276396    0
276397    0
276398    0
276399    0
276400    0
276401    0
276402    0
276403    0
276404    1
276405    1
276406    1
276407    1
276408    1
276409    1
276410    1
276411    1
276412    0
276413    0
276414    1
Name: age, Length: 276415, dtype: int64

In [44]:
# Inspect the final author id -> predicted class index mapping.
predicted_dict

{11253: 1,
 46465: 1,
 99382: 1,
 143781: 1,
 172224: 1,
 183164: 1,
 195585: 1,
 216442: 1,
 278367: 4,
 299143: 1,
 307112: 1,
 315751: 1,
 318564: 1,
 420325: 1,
 472101: 1,
 476109: 1,
 483062: 1,
 533593: 1,
 535158: 1,
 546941: 1,
 562322: 1,
 601845: 4,
 620124: 1,
 638526: 0,
 639335: 1,
 656452: 1,
 661265: 1,
 665284: 1,
 729786: 1,
 732730: 0,
 743007: 1,
 748386: 1,
 751202: 1,
 791676: 1,
 817097: 1,
 819785: 1,
 823780: 1,
 825029: 0,
 855133: 1,
 878554: 1,
 880351: 0,
 880688: 1,
 890200: 1,
 892173: 0,
 906909: 1,
 918695: 0,
 944615: 1,
 970036: 1,
 979795: 1,
 980975: 0,
 1000866: 0,
 1021779: 1,
 1047241: 1,
 1056581: 1,
 1058543: 1,
 1084944: 1,
 1097617: 1,
 1108003: 1,
 1111991: 1,
 1131982: 1,
 1132409: 4,
 1135325: 1,
 1157338: 1,
 1171643: 1,
 1198592: 1,
 1205193: 0,
 1207422: 1,
 1221843: 1,
 1242759: 4,
 1259179: 1,
 1274564: 1,
 1274993: 1,
 1296098: 1,
 1296370: 0,
 1306027: 0,
 1315669: 1,
 1316855: 1,
 1336804: 1,
 1352859: 1,
 1361398: 1,
 1366447: 1,
