# Data aggregations
With the dataset we built for T2, there is still some data processing we need to do:
- Manually tag ads (as found [here](https://docs.google.com/spreadsheets/d/1YuU6g32nyDuk7oKvLV-U1rbYdVdM3fKGIxJ3mT2-8vs/edit#gid=0))
- Incorporate ads and images descriptions to the dataset (NLP - using embeddings)

In [1]:
import pandas as pd
import fasttext
from sklearn import preprocessing

In [2]:
t2_df = pd.read_csv('../../data/AllUsers_Ads_Ratings_df.csv')
t2_df.head()

  t2_df = pd.read_csv('../../data/AllUsers_Ads_Ratings_df.csv')


Unnamed: 0,UserId,AdId,Age,Cap/Zip-Code,Countries visited,Fave Sports,Gender,Home country,Home town,Income,...,fave8,fave9,unfave1,unfave2,unfave3,unfave4,unfave5,unfave6,AdFilePath,Rating
0,U0001,A01_01,62,15613,United States of America,I do not like Sports,F,United States of America,Apollo,1,...,,,news headlines,homelessness,violence,war,human rights,,../../data/ads16-dataset/ADS16_Benchmark_part1...,1.0
1,U0001,A01_02,62,15613,United States of America,I do not like Sports,F,United States of America,Apollo,1,...,,,news headlines,homelessness,violence,war,human rights,,../../data/ads16-dataset/ADS16_Benchmark_part1...,1.0
2,U0001,A01_03,62,15613,United States of America,I do not like Sports,F,United States of America,Apollo,1,...,,,news headlines,homelessness,violence,war,human rights,,../../data/ads16-dataset/ADS16_Benchmark_part1...,1.0
3,U0001,A01_04,62,15613,United States of America,I do not like Sports,F,United States of America,Apollo,1,...,,,news headlines,homelessness,violence,war,human rights,,../../data/ads16-dataset/ADS16_Benchmark_part1...,1.0
4,U0001,A01_05,62,15613,United States of America,I do not like Sports,F,United States of America,Apollo,1,...,,,news headlines,homelessness,violence,war,human rights,,../../data/ads16-dataset/ADS16_Benchmark_part1...,1.0


In [3]:
t2_df.columns

Index(['UserId', 'AdId', 'Age', 'Cap/Zip-Code', 'Countries visited',
       'Fave Sports', 'Gender', 'Home country', 'Home town', 'Income',
       'Last Name', 'Most listened musics', 'Most read books',
       'Most visited websites', 'Most watched movies',
       'Most watched tv programmes', 'Name', 'Paypal', 'Timepass',
       'Type of Job', 'Weekly working hours', 'fave1', 'fave10', 'fave2',
       'fave3', 'fave4', 'fave5', 'fave6', 'fave7', 'fave8', 'fave9',
       'unfave1', 'unfave2', 'unfave3', 'unfave4', 'unfave5', 'unfave6',
       'AdFilePath', 'Rating'],
      dtype='object')

## Favorite and unfavorite picture descriptions

In [4]:
t2_df['faves'] = t2_df[['fave1', 'fave2','fave3', 'fave4','fave5', 'fave6', 'fave7', 'fave8', 'fave9', 'fave10']].apply(lambda x: ' '.join(x.map(str)), axis = 1)
t2_df['unfaves'] = t2_df[['unfave1', 'unfave2','unfave3', 'unfave4','unfave5', 'unfave6']].apply(lambda x: ' '.join(x.map(str)), axis = 1)

In [5]:
t2_df.drop(columns = [
    'fave1', 
    'fave2',
    'fave3', 
    'fave4',
    'fave5', 
    'fave6', 
    'fave7', 
    'fave8', 
    'fave9', 
    'fave10',
    'unfave1', 
    'unfave2',
    'unfave3',
    'unfave4',
    'unfave5',
    'unfave6'
    ],
    axis = 1,
    inplace = True
)

In [6]:
sep = ' nan'
t2_df['faves'] = t2_df['faves'].apply(lambda x: x.split(sep, 1)[0])
t2_df['unfaves'] = t2_df['unfaves'].apply(lambda x: x.split(sep, 1)[0])

In [7]:
t2_df['faves']

0        my cats my cats movie we are in tv show we are...
1        my cats my cats movie we are in tv show we are...
2        my cats my cats movie we are in tv show we are...
3        my cats my cats movie we are in tv show we are...
4        my cats my cats movie we are in tv show we are...
                               ...                        
36115    nike Muhammad Ali House of Cards Peace Hidden ...
36116    nike Muhammad Ali House of Cards Peace Hidden ...
36117    nike Muhammad Ali House of Cards Peace Hidden ...
36118    nike Muhammad Ali House of Cards Peace Hidden ...
36119    nike Muhammad Ali House of Cards Peace Hidden ...
Name: faves, Length: 36120, dtype: object

### Embedding

In [8]:
def embedding_model(df_col, name):
    with open('../../data/' + name + '.txt', 'w') as f:
        for line in df_col.values:
            f.write(line)
            f.write('\n')
    
    model = fasttext.train_unsupervised('../../data/' + name + '.txt')

    t2_df[name + '_embeddings'] = df_col.apply(lambda x: model.get_sentence_vector(x))
    print(t2_df[name + '_embeddings'])

In [9]:
embedding_model(t2_df['faves'], 'faves')

Read 0M words
Number of words:  1432
Number of labels: 0
Progress: 100.0% words/sec/thread:   79635 lr:  0.000000 avg.loss:  0.331012 ETA:   0h 0m 0s


0        [0.0033652314, -0.09599134, 0.09924359, -0.039...
1        [0.0033652314, -0.09599134, 0.09924359, -0.039...
2        [0.0033652314, -0.09599134, 0.09924359, -0.039...
3        [0.0033652314, -0.09599134, 0.09924359, -0.039...
4        [0.0033652314, -0.09599134, 0.09924359, -0.039...
                               ...                        
36115    [-0.015357981, -0.13848145, -0.012106664, 0.00...
36116    [-0.015357981, -0.13848145, -0.012106664, 0.00...
36117    [-0.015357981, -0.13848145, -0.012106664, 0.00...
36118    [-0.015357981, -0.13848145, -0.012106664, 0.00...
36119    [-0.015357981, -0.13848145, -0.012106664, 0.00...
Name: faves_embeddings, Length: 36120, dtype: object


In [10]:
embedding_model(t2_df['unfaves'], 'unfaves')

Read 0M words
Number of words:  1155
Number of labels: 0
Progress: 100.0% words/sec/thread:  118996 lr:  0.000000 avg.loss:  0.399288 ETA:   0h 0m 0s


0        [-0.06216975, -0.034681756, -0.09033885, 0.077...
1        [-0.06216975, -0.034681756, -0.09033885, 0.077...
2        [-0.06216975, -0.034681756, -0.09033885, 0.077...
3        [-0.06216975, -0.034681756, -0.09033885, 0.077...
4        [-0.06216975, -0.034681756, -0.09033885, 0.077...
                               ...                        
36115    [-0.09590767, -0.044566523, -0.13932896, 0.176...
36116    [-0.09590767, -0.044566523, -0.13932896, 0.176...
36117    [-0.09590767, -0.044566523, -0.13932896, 0.176...
36118    [-0.09590767, -0.044566523, -0.13932896, 0.176...
36119    [-0.09590767, -0.044566523, -0.13932896, 0.176...
Name: unfaves_embeddings, Length: 36120, dtype: object


In [11]:
t2_df.drop(['faves', 'unfaves'], axis = 1, inplace = True)

In [12]:
t2_df.columns

Index(['UserId', 'AdId', 'Age', 'Cap/Zip-Code', 'Countries visited',
       'Fave Sports', 'Gender', 'Home country', 'Home town', 'Income',
       'Last Name', 'Most listened musics', 'Most read books',
       'Most visited websites', 'Most watched movies',
       'Most watched tv programmes', 'Name', 'Paypal', 'Timepass',
       'Type of Job', 'Weekly working hours', 'AdFilePath', 'Rating',
       'faves_embeddings', 'unfaves_embeddings'],
      dtype='object')

In [13]:
unnecessary_columns = [
    'UserId', 
    'Cap/Zip-Code',
    'Name', 
    'Paypal',
    'AdFilePath'
]
t2_df.drop(unnecessary_columns, axis = 1, inplace = True)

## Ad Tags
Manually made!

In [14]:
tags = pd.read_csv('../../data/tags.csv')
tags.head()

Unnamed: 0,AdId,accessories,alcohol,animamted,animated,antique,attractive,baby,banner,bar,...,used,wallet,wallets,warranty,white background,white-background,wholesale,wings,woman,work
0,A01_01,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,A01_02,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,A01_03,0,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,0,0
3,A01_04,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,A01_05,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
t2_df['AdId']

0        A01_01
1        A01_02
2        A01_03
3        A01_04
4        A01_05
          ...  
36115    A20_11
36116    A20_12
36117    A20_13
36118    A20_14
36119    A20_15
Name: AdId, Length: 36120, dtype: object

In [16]:
tags['AdId']

0      A01_01
1      A01_02
2      A01_03
3      A01_04
4      A01_05
        ...  
296    A20_11
297    A20_12
298    A20_13
299    A20_14
300    A20_15
Name: AdId, Length: 301, dtype: object

In [17]:
full_df = pd.concat([t2_df, tags], axis = 1)

In [18]:
full_df.drop(['AdId'], axis = 1, inplace = True)

## Categorical features
One Hot Encoder for all that aren't multi-label.

In [19]:
categorical_features = [
    'Gender',
    'Home country',
    'Home town',
    'Timepass',
    'Type of Job',
    'Weekly working hours'
]

In [20]:
full_df = pd.get_dummies(full_df, columns = categorical_features)

In [21]:
full_df.columns

Index(['Age', 'Countries visited', 'Fave Sports', 'Income', 'Last Name',
       'Most listened musics', 'Most read books', 'Most visited websites',
       'Most watched movies', 'Most watched tv programmes',
       ...
       'Type of Job_Consulting', 'Type of Job_Contract employment',
       'Type of Job_Housewife/Househusband', 'Type of Job_Odd job',
       'Type of Job_Self-employment', 'Type of Job_Student',
       'Type of Job_Temporary', 'Type of Job_Unemploied',
       'Weekly working hours_Full Time', 'Weekly working hours_Part Time'],
      dtype='object', length=283)

### Multilabel

In [44]:
def multilabel_binarizer(df_col):
    line_classes = []
    for line in range(0, len(df_col)):
        data_classes = []
        for item in range(0, len(df_col[line].split(', '))):
            data_classes.append(df_col[line].split(', ')[item])
        line_classes.append(data_classes)
    binarizer = preprocessing.MultiLabelBinarizer()
    binarized_columns = pd.DataFrame(binarizer.fit_transform(line_classes), columns = binarizer.classes_)
    return binarized_columns

In [45]:
multilabel_binarizer(full_df['Countries visited'])

Unnamed: 0,Antigua &amp; Barbuda,Argentina,Aruba,Australia,Austria,Bahamas,Bahrain,Barbados,Belgium,Belize,...,Taiwan,Thailand,Trinidad &amp; Tobago,Turkey,Ukraine,United Arab Emirates,United Kingdom,United States of America,Vietnam,Virgin Islands (USA)
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36115,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36116,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36117,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36118,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
multilabel_columns = [
    'Countries visited',
    'Fave Sports',
    'Most listened musics', 
    'Most read books',
    'Most visited websites',
    'Most watched movies',
    'Most watched tv programmes'
]

In [48]:
for col in multilabel_columns:
    new_cols = multilabel_binarizer(full_df[col])
    full_df = pd.concat([full_df, new_cols], axis = 1)

In [49]:
full_df.columns

Index(['Age', 'Countries visited', 'Fave Sports', 'Income', 'Last Name',
       'Most listened musics', 'Most read books', 'Most visited websites',
       'Most watched movies', 'Most watched tv programmes',
       ...
       'Comedy', 'Drama', 'Entertainment (Variety Shows)', 'Factual',
       'Learning', 'Music', 'News', 'Religion &amp; Ethics', 'Sport',
       'Weather'],
      dtype='object', length=520)

In [50]:
full_df.drop(multilabel_columns, inplace = True, axis = 1)

## Binarize Rating
Finally, the last step is to make the y variable a categorical one.

In [52]:
full_df['Rating_bin'] = full_df['Rating'].apply(lambda x: 1 if x >= 4 else 0)

In [55]:
full_df.drop('Rating', axis = 1, inplace = True)

In [56]:
full_df.to_csv('../../data/final_features_df.csv')