# 1. Single city testing

## 1.1. Learing about datasets

In [227]:
import datetime

import itertools

import pandas as pd
import datefinder as datef

In [228]:
def read_dataframe(src: str, **args):
    return pd.read_csv(f'../etc/{src}', **args).rename(columns={
        'File name': '_id',
        'row ID': '_id'
    })

In [229]:
def read_city(city: str):
    return {
        'train': read_dataframe(f'{city}Train.csv'),
        'test': read_dataframe(f'{city}Test.csv'),
        'mdf': read_dataframe(f'{city}/Twitter/manuallyDefinedTextFeatures.csv'),
        'liwc': read_dataframe(f'{city}/Twitter/LIWCFeatures.csv'),
        'lda': read_dataframe(f'{city}/Twitter/LDA50Features.csv'),
        'lda_topics': read_dataframe(f'{city}/Twitter/descriptions/LDA50FeaturesTopicTerms.csv', encoding='unicode_escape'),
        'image': read_dataframe(f'{city}/Instagram/imageConceptsFeatures.csv'),
        'fs': read_dataframe(f'{city}/Foursquare/venueCategoriesFeatures3MonthsTrain.csv')
    }

In [230]:
sgpr = read_city('Singapore')

In [231]:
sgpr['test'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 222 entries, 0 to 221
Data columns (total 10 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   _id                                 222 non-null    object
 1   educationInfoForAgeGroupEstimation  132 non-null    object
 2   workInfoForAgeGroupEstimation       169 non-null    object
 3   gender                              222 non-null    object
 4   realAge                             222 non-null    int64 
 5   ageGroup                            222 non-null    object
 6   relationship                        118 non-null    object
 7   educationLevel                      112 non-null    object
 8   occupation                          101 non-null    object
 9   income                              17 non-null     object
dtypes: int64(1), object(9)
memory usage: 17.5+ KB


In [232]:
sgpr['test']

Unnamed: 0,_id,educationInfoForAgeGroupEstimation,workInfoForAgeGroupEstimation,gender,realAge,ageGroup,relationship,educationLevel,occupation,income
0,04aeb24cba16fe4550eab94b1f2720ce,taska kemas Class of 1992 � Kulai,Bacteriostatic Guitarist � 2011 to present � K...,male,27,AGE20_30,married,,management,
1,afd30f6d8587d96f6c53c131611c72c0,semashur Class of 2003 � Ulu Yam Baharu Pusat ...,Hospital Serdang House Officer � 4 January 201...,male,28,AGE20_30,single,undergraduate,protective service,$
2,31c71ec6ba45857482f5a72874049019,Kaplan Professional - Singapore Mass Communica...,The Chupitos Bar Waitress Best Fries Forever,female,21,AGE20_30,,school,food preparation and service related,
3,b999b0cdf2cd2b455f2a4d2ea90335b3,Bukit View Secondary School Class of 2008 � Si...,"Singapore, Singapore October 1992 to present",female,22,AGE20_30,,college,,$$
4,7a4430663de0d579be41ca6eb983e79a,,,female,90,AGE50_INF,,,,
...,...,...,...,...,...,...,...,...,...,...
217,84d9974cf35881118d7bcbe888b4332d,"Orchid Park Secondary School Yishun, Singapore...",Club 21 - club21global.com StarHub Management ...,female,21,AGE20_30,,college,,
218,8ef424205a7e5f8d8a2dce0f4a284224,,D&#039;Kranji Farm Resort December 2014 to pre...,female,20,AGE20_30,single,,,
219,7bc23d1f754765638a81bafb87a06591,,Singapore Armed Forces 10 December 2014 to pre...,male,21,AGE20_30,,,"arts, design, entertainment, sports, and media",
220,cb8ac80d445a98fea1908c0ef4ad9fcb,Seng Kang Secondary School Singapore Temasek P...,Isetan Singapore Sales � 2013 to present,female,20,AGE20_30,single,college,sales and related,


In [233]:
sgpr['test']['educationLevel'].unique()

array([nan, 'undergraduate', 'school', 'college'], dtype=object)

In [234]:
sgpr['liwc'].head(5)

Unnamed: 0,_id,WC,WPS,Qmarks,Unique,Dic,Sixltr,funct,pronoun,ppron,...,work,achieve,leisure,home,money,relig,death,assent,nonfl,filler
0,e9d39ae73f017eace2440900fb01ad30,3114.0,57.67,0.48,34.81,61.95,19.27,33.04,4.24,2.31,...,1.54,1.41,4.66,2.6,0.67,0.26,0.26,0.16,0.1,0.19
1,98f7b7f140770469440a5197865ad44d,3078.0,11.36,1.4,33.37,78.2,12.54,45.32,11.18,7.8,...,1.23,1.4,1.92,0.19,0.52,0.29,0.19,1.17,0.06,0.13
2,2280a0236964e86d15bb876b1d012e0c,1519.0,10.7,0.53,50.16,75.05,17.38,41.67,7.97,4.81,...,2.3,1.71,2.11,0.72,1.25,0.13,0.07,0.59,0.07,0.46
3,0aa283e80a7fa5b3ab9c5fcd5af6bcb1,10031.0,12.52,1.37,25.36,80.83,13.99,50.87,15.22,10.35,...,1.01,1.1,1.26,0.47,0.54,0.49,0.23,1.64,0.16,0.42
4,263138a8b987787df892aaafe081a2d6,810.0,15.0,0.25,49.38,74.94,17.16,48.4,10.74,6.42,...,1.23,1.73,1.48,0.37,0.86,0.37,0.0,0.0,0.0,0.25


In [235]:
sgpr['lda'].head(5)

Unnamed: 0,_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_40,topic_41,topic_42,topic_43,topic_44,topic_45,topic_46,topic_47,topic_48,topic_49
0,3c85ed7780f49b22e654fbeaa87523fc,0.000534,0.001472,0.000219,2.5e-05,0.413245,0.000765,0.021634,4.2e-05,0.000746,...,0.000422,0.000978,0.113929,0.000539,0.000135,0.007578,0.000831,0.000171,0.001501,0.000186
1,b7eb9e9fc4957568852cd390ea39a5f4,0.002191,0.006042,0.000897,0.000102,0.013234,0.003143,0.001751,0.000171,0.003061,...,0.001733,0.004015,0.003366,0.002212,0.000555,0.00209,0.003413,0.000701,0.006163,0.000764
2,e9a76a5383fb127fa24f32322273802b,0.000249,0.000686,0.000102,1.2e-05,0.363993,0.000357,0.000199,1.9e-05,0.000348,...,0.000197,0.072954,0.000382,0.003547,6.3e-05,0.000237,0.000387,8e-05,0.0007,8.7e-05
3,99f4a68f29e4a4c59a57f637fb80dc66,0.021316,0.004126,0.000613,7e-05,0.009037,0.002146,0.001195,0.000117,0.002091,...,0.001184,0.002742,0.002299,0.001511,0.000379,0.001427,0.00233,0.000479,0.004209,0.000522
4,41e3027469521674a8fec819a75f010f,0.000125,0.132915,5.1e-05,6e-06,0.21121,0.003494,0.18404,1e-05,0.000175,...,9.9e-05,0.026743,0.010135,0.000126,0.009974,0.03989,0.000195,4e-05,0.000352,4.4e-05


In [236]:
sgpr['image'].head(5)

Unnamed: 0,_id,numberOfImages,i1,i2,i3,i4,i5,i6,i7,i8,...,i991,i992,i993,i994,i995,i996,i997,i998,i999,i1000
0,8050cd99ddb796c99eaa8ddbd0219cde,39,13.317423,15.076338,17.179223,15.131572,14.863753,12.888127,13.211238,15.808906,...,15.716843,16.092305,15.055288,15.146896,14.893339,16.584632,15.621156,17.628895,15.053594,15.896505
1,af9cd9796e906d69e5394dc00f0ece37,7,2.319559,2.662407,2.985568,2.786017,2.705456,2.714111,2.376922,2.68915,...,2.704231,2.866833,2.615928,2.827796,2.857704,2.910568,2.755289,3.030762,2.823584,2.866577
2,128457b6e455e48960e09fc0d546ec37,70,25.023548,25.936653,29.111209,26.696439,26.637954,25.117927,24.527752,26.243793,...,26.96856,28.089201,25.116425,30.025714,27.804933,28.621096,26.875719,26.774403,27.141467,28.681892
3,0746a3d8ba79bb64c177da3519351dee,8,2.551922,3.1611,3.286251,2.965437,2.84599,2.508637,2.652265,3.137161,...,3.18301,3.294753,3.428294,3.072773,3.122732,3.058485,3.029626,3.274966,2.841433,3.097695
4,59f124c113fcf4f76a0b62eb4b39356f,286,103.061359,101.361321,117.605604,108.484895,103.04154,104.697371,100.462325,108.792497,...,115.348743,113.192499,103.049083,111.284439,105.304714,118.635131,113.952439,112.004387,111.038684,117.18131


In [237]:
sgpr['fs'].head(5)

Unnamed: 0,_id,categoryMentions,f1,f2,f3,f4,f5,f6,f7,f8,...,f755,f756,f757,f758,f759,f760,f761,f762,f763,f764
0,70467a1614f271f06f2637c1b10e163e,76,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,8050cd99ddb796c99eaa8ddbd0219cde,23,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,af9cd9796e906d69e5394dc00f0ece37,6,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,128457b6e455e48960e09fc0d546ec37,41,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0746a3d8ba79bb64c177da3519351dee,15,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 1.2. Flexing dataframes

In [238]:
def find_dates(s: str):
    s.replace('present', '2024')
    return datef.find_dates(s)
    
def earliest_year(s: str):
    if type(s) != str:
        return None
    date = find_dates(s)
    try:
        date = min(*find_dates(s))
    except TypeError:
        pass
    return date.year if type(date) is datetime.datetime else None

def latest_year(s: str):
    if type(s) != str:
        return None
    date = find_dates(s)
    try:
        date = max(*find_dates(s))
    except TypeError:
        pass
    return date.year if type(date) is datetime.datetime else None

def income_value(s: str):
    return s.count('$') if type(s) is str else 0
    

In [239]:
def main_preprocess(df: pd.DataFrame):
    df['educationEarliest'] = df['educationInfoForAgeGroupEstimation'].apply(earliest_year)
    df['educationLatest'] = df['educationInfoForAgeGroupEstimation'].apply(latest_year)
    
    df['workEarliest'] = df['workInfoForAgeGroupEstimation'].apply(earliest_year)
    df['workLatest'] = df['workInfoForAgeGroupEstimation'].apply(latest_year)
    
    df = df.drop(['educationInfoForAgeGroupEstimation', 'workInfoForAgeGroupEstimation'], axis=1)
    
    df['income'] = df['income'].apply(income_value)
    return df

In [240]:
main_preprocess(sgpr['test'])

Unnamed: 0,_id,gender,realAge,ageGroup,relationship,educationLevel,occupation,income,educationEarliest,educationLatest,workEarliest,workLatest
0,04aeb24cba16fe4550eab94b1f2720ce,male,27,AGE20_30,married,,management,0,,,,
1,afd30f6d8587d96f6c53c131611c72c0,male,28,AGE20_30,single,undergraduate,protective service,1,2003.0,2011.0,2012.0,2012.0
2,31c71ec6ba45857482f5a72874049019,female,21,AGE20_30,,school,food preparation and service related,0,,,,
3,b999b0cdf2cd2b455f2a4d2ea90335b3,female,22,AGE20_30,,college,,2,2008.0,2013.0,,
4,7a4430663de0d579be41ca6eb983e79a,female,90,AGE50_INF,,,,0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
217,84d9974cf35881118d7bcbe888b4332d,female,21,AGE20_30,,college,,0,,,2011.0,2024.0
218,8ef424205a7e5f8d8a2dce0f4a284224,female,20,AGE20_30,single,,,0,,,,
219,7bc23d1f754765638a81bafb87a06591,male,21,AGE20_30,,,"arts, design, entertainment, sports, and media",0,,,2014.0,2014.0
220,cb8ac80d445a98fea1908c0ef4ad9fcb,female,20,AGE20_30,single,college,sales and related,0,,,,


In [241]:
def main_composite(cities: list[str]):
    train = pd.DataFrame()
    test = pd.DataFrame()
    twitter = pd.DataFrame()
    instagram = pd.DataFrame()
    foursquare = pd.DataFrame()
    
    for city in cities:
        dfs = read_city(city)
        
        dfs['train'] = main_preprocess(dfs['train'])
        dfs['test'] = main_preprocess(dfs['test'])
        
        tmp_twitter = pd.merge(dfs['lda'], dfs['liwc'], on='_id')
        tmp_twitter = pd.merge(tmp_twitter, dfs['mdf'], on='_id')
        
        train = pd.concat([train, dfs['train']], ignore_index=True)
        test = pd.concat([test, dfs['test']], ignore_index=True)
        twitter = pd.concat([twitter, tmp_twitter], ignore_index=True)
        instagram = pd.concat([instagram, dfs['image']], ignore_index=True)
        foursquare = pd.concat([foursquare, dfs['fs']])
        
    return train, test, twitter, instagram, foursquare

In [242]:
sgpr_train, sgpr_test, sgpr_tr, sgpr_ig, sgpr_fs = main_composite(['Singapore'])

## 1.4. Models for feature predictions

In [243]:
def encode(labels, train, test):
    from sklearn.preprocessing import LabelEncoder
    result = []
    
    for label in labels:
        le = LabelEncoder()
        train[label] = le.fit_transform(train[label])
        test[label] = le.transform(test[label])
        
        result.append(le)
        
    return result, train, test

def measure(train: pd.DataFrame, test: pd.DataFrame, features: pd.DataFrame, columns: list[str], model, metric):
    train = train.drop(['ageGroup'], axis=1)
    test = test.drop(['ageGroup'], axis=1)
    
    les, train, test = encode(['gender', 'educationLevel', 'relationship', 'occupation'], train, test)
    
    train = pd.merge(train, features, on='_id').dropna(subset=columns).drop('_id', axis=1)
    test = pd.merge(test, features, on='_id').dropna(subset=columns).drop('_id', axis=1)
    
    train_x = train.drop(columns, axis=1)
    train_y = train[columns]

    model.fit(train_x, train_y)

    test_x = test.drop(columns, axis=1)
    test_y = test[columns]
    
    test_p = model.predict(test_x)

    return metric(test_y, test_p)     

In [244]:
from sklearn.multioutput import MultiOutputRegressor, MultiOutputClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, f1_score


In [245]:
def mean_measurer(y, p):
    result = {}
    for i, column in enumerate(y.columns):
        result[column] = mean_squared_error(y[column], [row[i] for row in p])
    
    return result

def regression(train, test, features, columns):
    return measure(train, test, features, columns, MultiOutputRegressor(HistGradientBoostingRegressor()), mean_measurer)

In [246]:
def f1_measurer(y, p):
    result = {}
    for i, column in enumerate(y.columns):
        result[column] = f1_score(y[column], [row[i] for row in p], average='micro')
    
    return result

def classification(train, test, features, columns):
    return measure(train, test, features, columns, MultiOutputClassifier(HistGradientBoostingClassifier(random_state=42)), f1_measurer)

## 1.5. Fusion while testing

In [247]:
def fusion(train, test, twitter, instagram, foursquare):
    features = [['T', twitter], ['I', instagram], ['F', foursquare]]
    results = {}
    
    for r in range(1, len(features) + 1):
        for combination in itertools.combinations(features, r):
            combined_features = None
            
            print(''.join(sorted([c[0] for c in combination])), end='-')
            
            for feature in [c[1] for c in combination]:
                if combined_features is None:
                    combined_features = feature
                else:
                    combined_features = pd.merge(combined_features, feature, on='_id')
            
            results[''.join(sorted([c[0] for c in combination]))] = {
                **regression(train, test, combined_features, ['realAge', 'income']),
                **classification(train, test, combined_features, ['educationLevel', 'relationship', 'occupation'])
            }
            
    styled = pd.DataFrame(results).T.style
    styled.set_caption('Fusion results')
    
    return styled

In [248]:
fusion(sgpr_train, sgpr_test, sgpr_tr, sgpr_ig, sgpr_fs)

T-I-

KeyboardInterrupt: 

# 2. Multiple cities testing

In [None]:
fusion(*main_composite(['Singapore', 'NewYork', 'London']))