# t-SNE - Prototype 2
This t-SNE exercise utilizes a dataset which has been better restructured to suit a deep learning exercise. The final dataset includes all the pre-decided information except those stemming from:
- benefits from government development programs
- proactive measures
- government assistance information

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np


#ML imports
from sklearn.manifold import TSNE
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

#graph imports
import plotly.express as px
import plotly.graph_objs as go

In [3]:
#defining directories
dir_path = Path.cwd().parents[2]
processed_path = dir_path.joinpath("data/processed/tsne")
processed_file=processed_path.joinpath("tsne_prototype_2.csv")
processed_file


PosixPath('/mnt/d/work_isb/projects/frer/data/processed/tsne/tsne_prototype_2.csv')

## Data Wrangling

Unlike prototype 1, we shall include the entire dataset; subsetting on the basis of `success` will not be considered.

In [4]:

# Read the prototype-2 table from `processed_file` into a DataFrame.
# low_memory=False is passed through to pandas unchanged.
df = pd.read_csv(filepath_or_buffer=processed_file, low_memory=False)
df

Unnamed: 0,hh_id_panel,sur_yr,facility_Area Of Courtyard,facility_Bathroom,facility_Cable TV,facility_Cooking Gas,facility_Drinking Water Well,facility_Electrified,facility_Internet Connection,facility_Residential House,...,unit_price_stock_pulses,total_value_stock_cereals,total_value_stock_cooking fuel,total_value_stock_fodders,total_value_stock_oil seeds,total_value_stock_other items,total_value_stock_pulses,total_prodn,diff,success
0,IAPA0008,2010.0,1080.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,...,209.0,3088.0,520.0,,,230.0,142.0,,,
1,IAPA0008,2011.0,1080.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,...,60.0,1280.0,826.0,,,430.0,30.0,,,
2,IAPA0008,2012.0,1080.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,...,65.0,1560.0,630.0,,,1120.0,130.0,,,
3,IAPA0008,2013.0,1080.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,...,65.0,3235.0,1150.0,,,,195.0,93002.47,,
4,IAPA0009,2010.0,480.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,75.0,420.0,270.0,,,120.0,37.5,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6772,IORB0203,2011.0,,,,,,,,,...,,,,,,,,,,
6773,IORB0204,2011.0,,,,,,,,,...,,,,,,,,,,
6774,IORB0205,2011.0,,,,,,,,,...,,,,,,,,,,
6775,IBHA0080,2011.0,,,,,,,,,...,,,,,,,,,,


## Summary Statistics

In [17]:
# Number of distinct household identifiers in the whole table (NaN ids are not counted).
df["hh_id_panel"].nunique(dropna=True)

1527

Total Number of households across years: 1527

Total number of households per year:

In [18]:
# Distinct household identifiers within each survey year.
df.groupby(by="sur_yr")["hh_id_panel"].nunique()

sur_yr
2010.0    1346
2011.0    1352
2012.0    1349
2013.0    1359
2014.0    1371
Name: hh_id_panel, dtype: int64

## Initiating T-SNE

### Preprocessing Data - Missing Value Imputation
The prototype 2 final CSV file was preprocessed to encode all the categorical variables. The only remaining preprocessing step is missing value imputation.


In [5]:
# Isolating the feature names: every column except the identifier,
# survey-year and outcome columns, in their original order.
cols_missing = df.columns[
    ~df.columns.isin(["hh_id_panel", "sur_yr", "total_prodn", "diff", "success"])
].tolist()


In [6]:
# Keep the identifier, survey-year and outcome columns aside; these are the
# columns excluded from the feature list used for imputation.
left_hand_file = df.loc[:, ["hh_id_panel", "sur_yr", "total_prodn", "diff", "success"]]
left_hand_file

Unnamed: 0,hh_id_panel,sur_yr,total_prodn,diff,success
0,IAPA0008,2010.0,,,
1,IAPA0008,2011.0,,,
2,IAPA0008,2012.0,,,
3,IAPA0008,2013.0,93002.47,,
4,IAPA0009,2010.0,,,
...,...,...,...,...,...
6772,IORB0203,2011.0,,,
6773,IORB0204,2011.0,,,
6774,IORB0205,2011.0,,,
6775,IBHA0080,2011.0,,,


In [7]:

# Fill the missing feature values with scikit-learn's IterativeImputer.
# The seed is fixed so repeated runs give the same imputed values.
imputer=IterativeImputer(random_state=100, max_iter=50)
# fit_transform returns a dense NumPy array, so it is wrapped directly;
# calling `.toarray()` on an ndarray raises AttributeError.
# The resulting frame has positional integer column labels.
righ_hand_file=pd.DataFrame(imputer.fit_transform(df[cols_missing]))
righ_hand_file

### T-SNE

In [22]:
# Embed the feature matrix in two dimensions with t-SNE.
# t-SNE is stochastic, so random_state is fixed (same seed as the imputer)
# to make the embedding reproducible across runs.
# NOTE(review): `data_file` is not defined in any visible cell of this
# notebook; confirm which cell builds it so Restart & Run All works.
tsne=TSNE(n_components=2, perplexity=100, random_state=100)
df_tsne=pd.DataFrame(tsne.fit_transform(data_file))
df_tsne

Unnamed: 0,0,1
0,-170.832108,-21.202190
1,-170.832108,-21.202190
2,-123.650719,17.852234
3,-123.650719,17.852234
4,-123.650719,17.852234
...,...,...
612,204.654175,82.103882
613,204.654175,82.103882
614,204.654175,82.103882
615,231.300919,116.216537


In [23]:
# Place the metadata columns and the two t-SNE components side by side.
# Column-wise concat aligns rows on the index labels of both frames.
# NOTE(review): `meta_file` is not defined in any visible cell; confirm its
# index matches df_tsne's default 0..n-1 index.
out_df = pd.concat(objs=[meta_file, df_tsne], axis="columns")
out_df

Unnamed: 0,hh_id_panel,sur_yr,success,0,1
0,IBHA0045,2010,1.0,-170.832108,-21.202190
1,IBHA0049,2010,0.0,-170.832108,-21.202190
2,IBHB0038,2010,1.0,-123.650719,17.852234
3,IBHB0043,2010,1.0,-123.650719,17.852234
4,IBHB0050,2010,1.0,-123.650719,17.852234
...,...,...,...,...,...
612,IMPA0039,2014,1.0,204.654175,82.103882
613,IMPA0040,2014,1.0,204.654175,82.103882
614,IMPA0054,2014,0.0,204.654175,82.103882
615,IMPB0032,2014,1.0,231.300919,116.216537


### Graphing T-SNE

In [27]:
# Scatter the two t-SNE components (columns 0 and 1 of out_df), coloured by
# the `success` label. The label is cast to str so plotly colours it as a
# discrete category rather than on a continuous scale.
fig = px.scatter(x=out_df[0], y=out_df[1], color=out_df['success'].astype(str))
# Title, axis labels and legend title so the figure stands alone when skimmed.
fig.update_layout(
    title="t-SNE embedding (prototype 2), coloured by success",
    xaxis_title="t-SNE component 1",
    yaxis_title="t-SNE component 2",
    legend_title_text="success",
)
fig.show()