# T-SNE - Bland with only geographic identifiers

In [16]:
from pathlib import Path
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import OneHotEncoder
from sklearn.manifold import TSNE
import plotly.express as px
import plotly.graph_objs as go

In [2]:
#defining directories
dir_path = Path.cwd().parents[2]
processed_path = dir_path.joinpath("data/processed/tsne")
processed_file=processed_path.joinpath("tsne.csv")
processed_file


PosixPath('/mnt/d/work_isb/projects/frer/data/processed/tsne/tsne.csv')

## Data Wrangling

The data will be subset to cases where `success` is non-missing, and all variables except the household ID, survey year, geographic identifiers (state, district, block, village) and `success` will be removed.

In [3]:

# Columns retained for the analysis: household id, survey year, the four
# geographic levels, and the `success` column.
cols_to_keep = [
    "hh_id_panel",
    "sur_yr",
    "state",
    "district",
    "block",
    "village",
    "success",
]

# Load the prepared file, drop rows whose `success` is missing, keep only the
# columns listed above, and renumber the rows from zero.
df = (
    pd.read_csv(processed_file)
    .dropna(subset=["success"])
    .loc[:, cols_to_keep]
    .reset_index(drop=True)
)
df

Unnamed: 0,hh_id_panel,sur_yr,state,district,block,village,success
0,IBHA0045,2010,Bihar,Patna,Bikram,Arap,1.0
1,IBHA0049,2010,Bihar,Patna,Bikram,Arap,0.0
2,IBHB0038,2010,Bihar,Patna,Bikram,Baghakole,1.0
3,IBHB0043,2010,Bihar,Patna,Bikram,Baghakole,1.0
4,IBHB0050,2010,Bihar,Patna,Bikram,Baghakole,1.0
...,...,...,...,...,...,...,...
612,IMPA0039,2014,Madhya_Pradesh,Raisen,Gairatganj,Papda,1.0
613,IMPA0040,2014,Madhya_Pradesh,Raisen,Gairatganj,Papda,1.0
614,IMPA0054,2014,Madhya_Pradesh,Raisen,Gairatganj,Papda,0.0
615,IMPB0032,2014,Madhya_Pradesh,Raisen,Gairatganj,Rampur_Kalan,1.0


## Initiating T-SNE

T-SNE, in the current exercise, will be limited to geographic clustering only and shall be prepared separately for each year.

### Preprocessing Data - One-Hot Encoding


In [4]:
# Household id, survey year and `success` are set aside here; they are not
# passed to the encoder and are re-attached to the embedding later.
meta_columns = ['hh_id_panel', 'sur_yr', 'success']
meta_file = df.loc[:, meta_columns]

In [5]:


# One-hot encode the four geographic levels. The encoder output is converted
# to a dense array and wrapped in a DataFrame with default integer column
# labels.
onehotencoder = OneHotEncoder()
geo_encoded = onehotencoder.fit_transform(df[['state', 'district', 'block', 'village']])
data_file = pd.DataFrame(geo_encoded.toarray())

### T-SNE

In [22]:
# t-SNE is stochastic: without a fixed seed every run yields a different
# layout, so the coordinates are pinned with `random_state`.
TSNE_SEED = 42
# Perplexity must be smaller than the number of rows being embedded.
TSNE_PERPLEXITY = 100

# NOTE(review): the text above says the embedding is prepared separately for
# each year, but this cell fits a single embedding across all rows of
# data_file — confirm which is intended.
tsne = TSNE(n_components=2, perplexity=TSNE_PERPLEXITY, random_state=TSNE_SEED)
df_tsne = pd.DataFrame(tsne.fit_transform(data_file))
df_tsne

Unnamed: 0,0,1
0,-170.832108,-21.202190
1,-170.832108,-21.202190
2,-123.650719,17.852234
3,-123.650719,17.852234
4,-123.650719,17.852234
...,...,...
612,204.654175,82.103882
613,204.654175,82.103882
614,204.654175,82.103882
615,231.300919,116.216537


In [23]:
# Put the identifying columns back next to the two embedding coordinates;
# rows are matched on the index and the coordinate columns keep labels 0 and 1.
out_df = pd.concat(objs=[meta_file, df_tsne], axis="columns")
out_df

Unnamed: 0,hh_id_panel,sur_yr,success,0,1
0,IBHA0045,2010,1.0,-170.832108,-21.202190
1,IBHA0049,2010,0.0,-170.832108,-21.202190
2,IBHB0038,2010,1.0,-123.650719,17.852234
3,IBHB0043,2010,1.0,-123.650719,17.852234
4,IBHB0050,2010,1.0,-123.650719,17.852234
...,...,...,...,...,...
612,IMPA0039,2014,1.0,204.654175,82.103882
613,IMPA0040,2014,1.0,204.654175,82.103882
614,IMPA0054,2014,0.0,204.654175,82.103882
615,IMPB0032,2014,1.0,231.300919,116.216537


### Graphing T-SNE

In [27]:
# Scatter of the two embedding coordinates, coloured by `success` (cast to
# string so plotly treats it as a discrete category rather than a colour scale).
# Households that share state, district, block and village have identical
# one-hot inputs, so their points coincide in the plot.
fig = px.scatter(x=out_df[0], y=out_df[1], color=out_df['success'].astype(str))
# A figure should be readable on its own: add a title, axis labels and a
# legend title.
fig.update_layout(
    title="t-SNE of one-hot encoded geographic identifiers",
    xaxis_title="t-SNE dimension 1",
    yaxis_title="t-SNE dimension 2",
    legend_title_text="success",
)
fig.show()