In [1]:
# Use the following features to derive new features:
# lang (en, und, tt) ... convert to a bool for each language
# to_user_name -> bool for conversational tweet
# from_user_lang .. alternative to loang ()probably not needed)
# source -> contains "instagram.com" becomes is_instagram
# tweet_text_contains_url -> needs a regex (google) or just http?
# also need to filter out null LGA as they are useless to us
# then need to summarise the new features per LGA

In [2]:
import numpy as np
import pandas as pd
import os
import re
import string
base = "../../data/clean/"
pd.set_option('display.max_rows', 150)

In [29]:
tweetdf = pd.read_csv(base+'loctrisma2016.csv')
len(tweetdf)

814398

In [30]:
lga_tweets_df = tweetdf[['lga','tweet_id']].groupby(['lga']).count().reset_index()
lga_tweets_df.columns = ['lga','total_tweets']
lga_tweets_df

Unnamed: 0,lga,total_tweets
0,Albury,20942
1,Armidale Regional,5997
2,Ballina,5463
3,Balranald,173
4,Bathurst Regional,6168
5,Bayside,5007
6,Bega Valley,4693
7,Bellingen,1532
8,Berrigan,1247
9,Blacktown,2675


In [31]:
lga_tweeters_df = tweetdf[['lga','from_user_id']].groupby(['lga']).agg({'from_user_id': lambda x: x.nunique()}).reset_index()
lga_tweeters_df.columns = ['lga','total_tweeters']
lga_tweeters_df


Unnamed: 0,lga,total_tweeters
0,Albury,775
1,Armidale Regional,306
2,Ballina,698
3,Balranald,34
4,Bathurst Regional,608
5,Bayside,770
6,Bega Valley,479
7,Bellingen,196
8,Berrigan,102
9,Blacktown,348


In [38]:
derived_df = pd.merge(lga_tweets_df, lga_tweeters_df, on='lga')
derived_df

Unnamed: 0,lga,total_tweets,total_tweeters
0,Albury,20942,775
1,Armidale Regional,5997,306
2,Ballina,5463,698
3,Balranald,173,34
4,Bathurst Regional,6168,608
5,Bayside,5007,770
6,Bega Valley,4693,479
7,Bellingen,1532,196
8,Berrigan,1247,102
9,Blacktown,2675,348


In [33]:
demographics_df = pd.read_csv('../../data/clean/2016_demographics.csv')
demographics_df.rename(columns={'Tot_P_P_G01': 'population'}, inplace=True)
demographics_df

Unnamed: 0.1,Unnamed: 0,lga,OPDs_Separate_house_Dwellings_G32,OPDs_Separate_house_Persons_G32,OPDs_SD_r_t_h_th_1_sty_Dwgs_G32,OPDs_SD_r_t_h_th_1_sty_Psns_G32,OPDs_SD_r_t_h_th_2_m_sty_Dwgs_G32,OPDs_SD_r_t_h_th_2_m_sty_Psns_G32,OPDs_SD_r_t_h_th_Tot_Dwgs_G32,OPDs_SD_r_t_h_th_Tot_Psns_G32,...,Part_inc_stated_Fam_household_G29,Part_inc_std_Non_fam_hhds_G29,Partial_income_stated_Tot_G29,All_incomes_ns_Famly_househld_G29,All_inc_ns_Non_famly_househld_G29,All_incomes_not_stated_Tot_G29,Tot_Family_households_G29,Tot_Non_family_households_G29,Tot_Tot_G29,area
0,0,Albury,15634,39601,2907,4658,416,675,3324,5334,...,1136,144,1280,236,320,554,12674,6824,19495,305.9459
1,1,Armidale Regional,8689,22010,771,1404,144,240,910,1637,...,680,89,765,116,186,299,6803,3469,10276,8620.699
2,2,Ballina,11223,29084,2129,3930,1486,2824,3612,6757,...,994,119,1118,179,311,488,11067,5156,16220,484.9389
3,3,Balranald,734,1907,15,23,3,3,22,24,...,83,6,86,18,24,39,563,240,807,21690.6753
4,4,Bathurst Regional,12166,32367,1137,1974,327,583,1460,2559,...,942,108,1054,164,214,377,9934,4478,14411,3817.8646
5,5,Bayside,21535,67356,4479,10788,3150,8947,7632,19731,...,3656,580,4231,467,615,1078,38482,15896,54377,49.8904
6,6,Bega Valley,10934,25770,834,1296,499,845,1336,2140,...,976,52,1033,144,257,399,8873,4344,13222,6278.8811
7,7,Bellingen,4466,10684,184,339,53,112,233,449,...,368,21,388,48,103,151,3370,1603,4970,1600.4337
8,8,Berrigan,2904,6728,107,139,0,0,107,139,...,199,12,216,51,97,155,2103,1159,3265,2065.7759
9,9,Blacktown,78774,257809,6231,18356,10055,32142,16286,50499,...,7193,390,7586,1163,1005,2168,83375,17730,101107,240.0487


In [34]:
# Baseline excludes LGA with no valid crime statistics:
base_dem_df = pd.read_csv('../../data/clean/baseline_demographics.csv')
demographics_df = pd.merge(demographics_df[['lga','population']],base_dem_df,left_on='lga', right_on='clean_name')
demographics_df = demographics_df[['lga','population']]
demographics_df

Unnamed: 0,lga,population
0,Albury,51076
1,Armidale Regional,29449
2,Ballina,41790
3,Bathurst Regional,41300
4,Bayside,156058
5,Bega Valley,33253
6,Bellingen,12668
7,Berrigan,8462
8,Blacktown,336962
9,Bland,5955


In [51]:
# Unfortunately the demographics CSV uses "greater Hume Shire" will trsima csv uses greater hum. So update demographics:
demographics_df.iloc[41,0] = 'Greater Hume'

In [52]:
derived_df = pd.merge(demographics_df[['lga','population']], derived_df,on='lga',how='left')
derived_df['tweets_per_capita'] = derived_df['total_tweets'] / derived_df['population']
derived_df['tweeters_per_capita'] = derived_df['total_tweeters'] / derived_df['population']
derived_df

Unnamed: 0,lga,population,total_tweets,total_tweeters,tweets_per_capita,tweeters_per_capita
0,Albury,51076,20942,775,0.410016,0.015173
1,Armidale Regional,29449,5997,306,0.20364,0.010391
2,Ballina,41790,5463,698,0.130725,0.016703
3,Bathurst Regional,41300,6168,608,0.149346,0.014722
4,Bayside,156058,5007,770,0.032084,0.004934
5,Bega Valley,33253,4693,479,0.14113,0.014405
6,Bellingen,12668,1532,196,0.120935,0.015472
7,Berrigan,8462,1247,102,0.147365,0.012054
8,Blacktown,336962,2675,348,0.007939,0.001033
9,Bland,5955,235,31,0.039463,0.005206


In [53]:
# Language booleans
langdf = tweetdf.lang.str.get_dummies()
langs = langdf.columns.tolist()
langdf = pd.concat([tweetdf,langdf],axis=1)[['lga','from_user_id']+langs]
print(langdf)

                                 lga  from_user_id  ar  bg  bn  bo  cs  cy  \
0                      Central Coast    4721717942   0   0   0   0   0   0   
1                        Eurobodalla    4721717942   0   0   0   0   0   0   
2                             Dungog    4721717942   0   0   0   0   0   0   
3            Port Macquarie-Hastings    4721717942   0   0   0   0   0   0   
4                            Lithgow      40962229   0   0   0   0   0   0   
5                     Lake Macquarie    4721717942   0   0   0   0   0   0   
6                             Temora    4721717942   0   0   0   0   0   0   
7                              Cowra    4721717942   0   0   0   0   0   0   
8                           Nambucca    4721717942   0   0   0   0   0   0   
9                            Lachlan    4721717942   0   0   0   0   0   0   
10                   Clarence Valley    4721717942   0   0   0   0   0   0   
11      Queanbeyan-Palerang Regional    4721717942   0   0   0  

In [54]:
DROP_COUNT=len(tweetdf) / 500
# Ignore undetermined, visual inspection shows it is normally just hashtags/english:
lga_langdf = langdf.drop(['und','from_user_id'],axis=1).groupby(['lga']).sum().reset_index()
sums = langdf.drop(['lga','from_user_id'],axis=1).sum().to_dict()
to_drop = [k for (k,v) in sums.items() if k == 'en' or v < DROP_COUNT]
lga_langdf = lga_langdf.drop(to_drop, axis=1)
lga_langdf

Unnamed: 0,lga,es,fr,in,ja,tl,tr
0,Albury,28,39,38,3,66,4
1,Armidale Regional,11,11,12,2,52,0
2,Ballina,7,5,11,6,14,0
3,Balranald,0,0,2,0,0,0
4,Bathurst Regional,13,14,7,4,12,0
5,Bayside,13,13,12,31,44,10
6,Bega Valley,27,5,25,0,52,1
7,Bellingen,0,3,4,1,20,0
8,Berrigan,11,5,13,0,8,1
9,Blacktown,4,3,26,1,33,2


Language counts will probably not be useful unless converted to speaker count. Even so the twitter language classifer is suspect. As such we would probably need to choose specific languages of interest rather than dynmically choosing them as in the code above. Furthermore the distribution across LGAs does not seem likely to be predictive. Unique tweeters per language may be of marginally more interest. Given there is also a risk this will be considered to be a form of ethnic profiling it is probably safer to leave these features out.
It is somewhat surprising that the most common languages apart from english and undetermined are es (spanish), fr (french), in (indonesian/malay), tl (tagalog/phillipines) and tr (turkish).

In [55]:
es_df = langdf[langdf['es']==1]
es_tweeters_df = es_df[['lga','from_user_id']].groupby(['lga']).agg({'from_user_id': lambda x: x.nunique()}).reset_index()
es_tweeters_df.columns = ['lga','es_tweeters']
es_tweeters_df

Unnamed: 0,lga,es_tweeters
0,Albury,20
1,Armidale Regional,4
2,Ballina,6
3,Bathurst Regional,8
4,Bayside,6
5,Bega Valley,8
6,Berrigan,3
7,Blacktown,3
8,Blayney,1
9,Blue Mountains,17


In [56]:
fr_df = langdf[langdf['fr']==1]
fr_tweeters_df = fr_df[['lga','from_user_id']].groupby(['lga']).agg({'from_user_id': lambda x: x.nunique()}).reset_index()
fr_tweeters_df.columns = ['lga','fr_tweeters']
fr_tweeters_df

Unnamed: 0,lga,fr_tweeters
0,Albury,24
1,Armidale Regional,7
2,Ballina,5
3,Bathurst Regional,9
4,Bayside,8
5,Bega Valley,4
6,Bellingen,3
7,Berrigan,2
8,Blacktown,2
9,Blue Mountains,44


In [57]:
in_df = langdf[langdf['in']==1]
in_tweeters_df = in_df[['lga','from_user_id']].groupby(['lga']).agg({'from_user_id': lambda x: x.nunique()}).reset_index()
in_tweeters_df.columns = ['lga','in_tweeters']
in_tweeters_df

Unnamed: 0,lga,in_tweeters
0,Albury,26
1,Armidale Regional,5
2,Ballina,10
3,Balranald,1
4,Bathurst Regional,4
5,Bayside,10
6,Bega Valley,18
7,Bellingen,3
8,Berrigan,5
9,Blacktown,7


In [58]:
ja_df = langdf[langdf['ja']==1]
ja_tweeters_df = ja_df[['lga','from_user_id']].groupby(['lga']).agg({'from_user_id': lambda x: x.nunique()}).reset_index()
ja_tweeters_df.columns = ['lga','ja_tweeters']
ja_tweeters_df

Unnamed: 0,lga,ja_tweeters
0,Albury,3
1,Armidale Regional,1
2,Ballina,4
3,Bathurst Regional,1
4,Bayside,6
5,Bellingen,1
6,Blacktown,1
7,Blayney,1
8,Blue Mountains,12
9,Broken Hill,1


In [59]:
tl_df = langdf[langdf['tl']==1]
tl_tweeters_df = tl_df[['lga','from_user_id']].groupby(['lga']).agg({'from_user_id': lambda x: x.nunique()}).reset_index()
tl_tweeters_df.columns = ['lga','tl_tweeters']
tl_tweeters_df

Unnamed: 0,lga,tl_tweeters
0,Albury,29
1,Armidale Regional,5
2,Ballina,10
3,Bathurst Regional,7
4,Bayside,14
5,Bega Valley,19
6,Bellingen,7
7,Berrigan,2
8,Blacktown,9
9,Blue Mountains,44


In [60]:
tr_df = langdf[langdf['tr']==1]
tr_tweeters_df = tr_df[['lga','from_user_id']].groupby(['lga']).agg({'from_user_id': lambda x: x.nunique()}).reset_index()
tr_tweeters_df.columns = ['lga','tr_tweeters']
tr_tweeters_df

Unnamed: 0,lga,tr_tweeters
0,Albury,4
1,Bayside,1
2,Bega Valley,1
3,Berrigan,1
4,Blacktown,2
5,Blue Mountains,1
6,Broken Hill,1
7,Burwood,1
8,Byron,4
9,Canada Bay,1


In [61]:
lga_langdf = pd.merge(lga_langdf, es_tweeters_df, on='lga', how='outer')
lga_langdf = pd.merge(lga_langdf, fr_tweeters_df, on='lga', how='outer')
lga_langdf = pd.merge(lga_langdf, in_tweeters_df, on='lga', how='outer')
lga_langdf = pd.merge(lga_langdf, ja_tweeters_df, on='lga', how='outer')
lga_langdf = pd.merge(lga_langdf, tl_tweeters_df, on='lga', how='outer')
lga_langdf = pd.merge(lga_langdf, tr_tweeters_df, on='lga', how='outer')
lga_langdf = lga_langdf.fillna(0)
lga_langdf

Unnamed: 0,lga,es,fr,in,ja,tl,tr,es_tweeters,fr_tweeters,in_tweeters,ja_tweeters,tl_tweeters,tr_tweeters
0,Albury,28,39,38,3,66,4,20.0,24.0,26.0,3.0,29.0,4.0
1,Armidale Regional,11,11,12,2,52,0,4.0,7.0,5.0,1.0,5.0,0.0
2,Ballina,7,5,11,6,14,0,6.0,5.0,10.0,4.0,10.0,0.0
3,Balranald,0,0,2,0,0,0,0.0,0.0,1.0,0.0,0.0,0.0
4,Bathurst Regional,13,14,7,4,12,0,8.0,9.0,4.0,1.0,7.0,0.0
5,Bayside,13,13,12,31,44,10,6.0,8.0,10.0,6.0,14.0,1.0
6,Bega Valley,27,5,25,0,52,1,8.0,4.0,18.0,0.0,19.0,1.0
7,Bellingen,0,3,4,1,20,0,0.0,3.0,3.0,1.0,7.0,0.0
8,Berrigan,11,5,13,0,8,1,3.0,2.0,5.0,0.0,2.0,1.0
9,Blacktown,4,3,26,1,33,2,3.0,2.0,7.0,1.0,9.0,2.0


In [62]:
replies_df = tweetdf[tweetdf.to_user_id.notnull()]
lga_replies_df = replies_df[['lga','to_user_id']].groupby(['lga']).agg({'to_user_id': ['count']}).reset_index()
lga_replies_df.columns = ['lga','total_reply_tweets']
lga_replies_df

Unnamed: 0,lga,total_reply_tweets
0,Albury,8081
1,Armidale Regional,2235
2,Ballina,1934
3,Balranald,57
4,Bathurst Regional,2270
5,Bayside,2619
6,Bega Valley,1792
7,Bellingen,170
8,Berrigan,600
9,Blacktown,1358


In [63]:
lga_replyers_df = replies_df[['lga','from_user_id']].groupby(['lga']).agg({'from_user_id': lambda x: x.nunique()}).reset_index()
lga_replyers_df.columns = ['lga','total_replyers']
lga_replyers_df

Unnamed: 0,lga,total_replyers
0,Albury,322
1,Armidale Regional,99
2,Ballina,161
3,Balranald,6
4,Bathurst Regional,181
5,Bayside,240
6,Bega Valley,150
7,Bellingen,41
8,Berrigan,31
9,Blacktown,87


In [64]:
derived_df = pd.merge(derived_df, lga_langdf,on='lga', how='left')
derived_df = pd.merge(derived_df, lga_replies_df[['lga','total_reply_tweets']],on='lga', how='left')
derived_df['reply_ratio'] = derived_df['total_reply_tweets'] / derived_df['total_tweets']
derived_df['replies_per_capita'] = derived_df['total_reply_tweets'] / derived_df['population']
derived_df = pd.merge(derived_df, lga_replyers_df[['lga','total_replyers']],on='lga', how='left')
derived_df['replyer_ratio'] = derived_df['total_replyers'] / derived_df['total_tweeters']
derived_df['replyers_per_capita'] = derived_df['total_replyers'] / derived_df['population']
derived_df

Unnamed: 0,lga,population,total_tweets,total_tweeters,tweets_per_capita,tweeters_per_capita,es,fr,in,ja,...,in_tweeters,ja_tweeters,tl_tweeters,tr_tweeters,total_reply_tweets,reply_ratio,replies_per_capita,total_replyers,replyer_ratio,replyers_per_capita
0,Albury,51076,20942,775,0.410016,0.015173,28,39,38,3,...,26.0,3.0,29.0,4.0,8081,0.385875,0.158215,322,0.415484,0.006304
1,Armidale Regional,29449,5997,306,0.20364,0.010391,11,11,12,2,...,5.0,1.0,5.0,0.0,2235,0.372686,0.075894,99,0.323529,0.003362
2,Ballina,41790,5463,698,0.130725,0.016703,7,5,11,6,...,10.0,4.0,10.0,0.0,1934,0.354018,0.046279,161,0.230659,0.003853
3,Bathurst Regional,41300,6168,608,0.149346,0.014722,13,14,7,4,...,4.0,1.0,7.0,0.0,2270,0.368029,0.054964,181,0.297697,0.004383
4,Bayside,156058,5007,770,0.032084,0.004934,13,13,12,31,...,10.0,6.0,14.0,1.0,2619,0.523068,0.016782,240,0.311688,0.001538
5,Bega Valley,33253,4693,479,0.14113,0.014405,27,5,25,0,...,18.0,0.0,19.0,1.0,1792,0.381845,0.05389,150,0.313152,0.004511
6,Bellingen,12668,1532,196,0.120935,0.015472,0,3,4,1,...,3.0,1.0,7.0,0.0,170,0.110966,0.01342,41,0.209184,0.003237
7,Berrigan,8462,1247,102,0.147365,0.012054,11,5,13,0,...,5.0,0.0,2.0,1.0,600,0.481155,0.070905,31,0.303922,0.003663
8,Blacktown,336962,2675,348,0.007939,0.001033,4,3,26,1,...,7.0,1.0,9.0,2.0,1358,0.507664,0.00403,87,0.25,0.000258
9,Bland,5955,235,31,0.039463,0.005206,0,0,0,0,...,0.0,0.0,0.0,0.0,34,0.144681,0.005709,9,0.290323,0.001511


In [71]:
# HUH? 0 for parra, 43 for Sydney? 12493 for Necastle????
#instadf = tweetdf[tweetdf.source.str.contains('http://instagram\.com')]
parra = tweetdf[tweetdf.lga == 'Parramatta']
len(parra[parra.source.str.contains('instagram')])
# instadf = tweetdf[tweetdf.source.str.contains('instagram')]
# instadf
parra[parra.text.str.contains('http')].text

4323      I've just broken up to four of these at the No...
22638     Learning #POCUS with the best - @ultrasoundpod...
22883     @robvad10 Here's a shot from their first train...
24463     For those interested, the brief for the new We...
25703     tencere yuvarlanmis, kapagini bulmus.... https...
27242     Almost time for tipoff!! #WeAreKings #BackInBl...
40275     Hmm. Some thoughts on this. https://t.co/HpuVX...
40355                            ‚ò∫Ô∏è https://t.co/T3cTkB2a8P
40385                    @ellymelly https://t.co/qfREEKABhe
40487                @paralauragram https://t.co/y0zmJLTins
40490     That 74GW will never be realized. If 'the wind...
40498            It changes colour? https://t.co/zZpDpLVSZ2
40575     ‚ÄúEveryone has a plan until they get punched in...
40626                            ÔøΩÔøΩ https://t.co/VViSCErXJH
40683                               https://t.co/VEXQMdaOKv
40711        This. This is is nice. https://t.co/ONdD8sP9gm
40763                        L

In [72]:
# source -> contains "instagram.com" becomes is_instagram
instadf = tweetdf[tweetdf.source.str.contains('http://instagram\.com')]
lga_instadf = instadf[['lga','source']].groupby(['lga']).agg({'source': ['count']}).reset_index()
lga_instadf.columns = ['lga','total_instagrams']
lga_instadf

Unnamed: 0,lga,total_instagrams
0,Albury,1519
1,Armidale Regional,554
2,Ballina,1840
3,Balranald,25
4,Bathurst Regional,1306
5,Bayside,107
6,Bega Valley,886
7,Bellingen,627
8,Berrigan,87
9,Blacktown,38


In [73]:
lga_instagrammers_df = instadf[['lga','from_user_id']].groupby(['lga']).agg({'from_user_id': lambda x: x.nunique()}).reset_index()
lga_instagrammers_df.columns = ['lga','total_instagrammers']
lga_instagrammers_df

Unnamed: 0,lga,total_instagrammers
0,Albury,263
1,Armidale Regional,115
2,Ballina,415
3,Balranald,16
4,Bathurst Regional,290
5,Bayside,33
6,Bega Valley,231
7,Bellingen,109
8,Berrigan,37
9,Blacktown,21


In [74]:
linkdf = tweetdf[tweetdf.text.str.contains('http')]
lga_linkdf = linkdf[['lga','tweet_id']].groupby(['lga']).count().reset_index()
lga_linkdf.columns = ['lga','total_link_tweets']
lga_linkdf

Unnamed: 0,lga,total_link_tweets
0,Albury,6618
1,Armidale Regional,3085
2,Ballina,3008
3,Balranald,111
4,Bathurst Regional,2741
5,Bayside,1328
6,Bega Valley,2851
7,Bellingen,1285
8,Berrigan,436
9,Blacktown,1019


In [75]:
derived_df = pd.merge(derived_df, lga_instadf,on='lga', how='left')
derived_df = pd.merge(derived_df, lga_instagrammers_df,on='lga', how='left')
derived_df = pd.merge(derived_df, lga_linkdf,on='lga', how='left')
derived_df['instagram_ratio'] = derived_df['total_instagrams'] / derived_df['total_tweets']
derived_df['instagrammers_ratio'] = derived_df['total_instagrammers'] / derived_df['total_tweeters']
derived_df['link_tweets_ratio'] = derived_df['total_link_tweets'] / derived_df['total_tweets']
derived_df

Unnamed: 0,lga,population,total_tweets,total_tweeters,tweets_per_capita,tweeters_per_capita,es,fr,in,ja,...,replies_per_capita,total_replyers,replyer_ratio,replyers_per_capita,total_instagrams,total_instagrammers,total_link_tweets,instagram_ratio,instagrammers_ratio,link_tweets_ratio
0,Albury,51076,20942,775,0.410016,0.015173,28,39,38,3,...,0.158215,322,0.415484,0.006304,1519.0,263.0,6618,0.072534,0.339355,0.316016
1,Armidale Regional,29449,5997,306,0.20364,0.010391,11,11,12,2,...,0.075894,99,0.323529,0.003362,554.0,115.0,3085,0.09238,0.375817,0.514424
2,Ballina,41790,5463,698,0.130725,0.016703,7,5,11,6,...,0.046279,161,0.230659,0.003853,1840.0,415.0,3008,0.336811,0.594556,0.550613
3,Bathurst Regional,41300,6168,608,0.149346,0.014722,13,14,7,4,...,0.054964,181,0.297697,0.004383,1306.0,290.0,2741,0.211738,0.476974,0.44439
4,Bayside,156058,5007,770,0.032084,0.004934,13,13,12,31,...,0.016782,240,0.311688,0.001538,107.0,33.0,1328,0.02137,0.042857,0.265229
5,Bega Valley,33253,4693,479,0.14113,0.014405,27,5,25,0,...,0.05389,150,0.313152,0.004511,886.0,231.0,2851,0.188792,0.482255,0.607501
6,Bellingen,12668,1532,196,0.120935,0.015472,0,3,4,1,...,0.01342,41,0.209184,0.003237,627.0,109.0,1285,0.409269,0.556122,0.838773
7,Berrigan,8462,1247,102,0.147365,0.012054,11,5,13,0,...,0.070905,31,0.303922,0.003663,87.0,37.0,436,0.069767,0.362745,0.349639
8,Blacktown,336962,2675,348,0.007939,0.001033,4,3,26,1,...,0.00403,87,0.25,0.000258,38.0,21.0,1019,0.014206,0.060345,0.380935
9,Bland,5955,235,31,0.039463,0.005206,0,0,0,0,...,0.005709,9,0.290323,0.001511,10.0,8.0,191,0.042553,0.258065,0.812766


In [76]:
derived_df.to_csv("../../data/clean/derived_features_2016.csv",index=False)

In [77]:
len(derived_df.columns)

30