# Importing libraries

In [103]:
import pandas as pd
from scipy.stats import ttest_ind

# Importing data files

In [104]:
# Load the cleaned data file; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="cleaned_datafile.csv")

In [105]:
# Preview the first five rows of the raw table.
df.head(n=5)

Unnamed: 0,State-Code,Name,TRU,Age-Group,Tot-P,Tot-M,Tot-F,Persons-exactly-1,Males-exactly-1,Females-exactly-1,Persons-exactly-2,Males-exactly-2,Females-exactly-2,Persons-atleast-3,Males-atleast-3,Females-atleast-3
0,0,INDIA,Rural,5-9,93807612,48825259,44982353,84515361,44009879,40505482,8619753,4459004,4160749,672498,356376,316122
1,0,INDIA,Rural,10-14,96804494,50488158,46316336,75843895,39482164,36361731,17321726,9076421,8245305,3638873,1929573,1709300
2,0,INDIA,Rural,15-19,83902472,44570557,39331915,59354806,31228261,28126545,17987978,9726204,8261774,6559688,3616092,2943596
3,0,INDIA,Rural,20-24,73835046,38138662,35696384,51929482,25958518,25970964,15939543,8730193,7209350,5966021,3449951,2516070
4,0,INDIA,Rural,25-29,66068270,33375989,32692281,48015316,23114804,24900512,13483541,7492507,5991034,4569413,2768678,1800735


# Removing irrelevant data and selecting the required columns

In [106]:
# Keep only the aggregate rows (TRU == 'Total' and Age-Group == 'Total') and
# only the columns used below: identifiers plus the male/female counts.
# NOTE: this rebinds `df` and drops 'TRU' / 'Age-Group', so the cell cannot be
# re-run without first reloading the CSV.
cols = [
    'State-Code', 'Name',
    'Tot-M', 'Tot-F',
    'Males-exactly-1', 'Females-exactly-1',
    'Males-exactly-2', 'Females-exactly-2',
    'Males-atleast-3', 'Females-atleast-3',
]
is_total_row = df['TRU'].eq('Total') & df['Age-Group'].eq('Total')
df = df.loc[is_total_row, cols]

In [107]:
# Preview the first five rows of the filtered table.
df.head(n=5)

Unnamed: 0,State-Code,Name,Tot-M,Tot-F,Males-exactly-1,Females-exactly-1,Males-exactly-2,Females-exactly-2,Males-atleast-3,Females-atleast-3
18,0,INDIA,623270258,587584719,446573875,449292332,126159551,102819639,50536832,35472748
48,1,JAMMU & KASHMIR,6640662,5900640,3005843,3352269,2376063,1710907,1258756,837464
78,2,HIMACHAL PRADESH,3481873,3382729,2797907,2823990,483488,411931,200478,146808
108,3,PUNJAB,14639465,13103873,7355282,7352832,2900342,2304969,4383841,3446072
138,4,CHANDIGARH,580663,474787,257765,217765,144341,113600,178557,143422


# Finding required values

## finding required values for people who can speak exactly one language

In [108]:
# Table for the "exactly one language" group, one row per row of df:
# the percentage of all males / all females counted in that group.
# 'p-value' is created empty here and filled in by a later cell.
result1 = pd.DataFrame(columns=['state-code', 'male-percentage', 'female-percentage', 'p-value'])
# Assigning a Series to the still-empty frame makes it adopt df's index.
result1['state-code'] = df['State-Code']
result1['male-percentage'] = df['Males-exactly-1'].div(df['Tot-M']).mul(100)
result1['female-percentage'] = df['Females-exactly-1'].div(df['Tot-F']).mul(100)

In [109]:
# Male-to-female count ratios for every row of df, computed column-wise
# instead of looping over rows with iterrows().
#   r1[i] = [ratio for exactly-1, ratio for exactly-2, ratio for at-least-3]
#   r2[i] = the overall Tot-M / Tot-F ratio repeated three times, so it has the
#           same length as r1[i] when both are passed to ttest_ind.
# A zero female count yields inf/NaN here rather than raising.
ratio_exactly_1 = df['Males-exactly-1'] / df['Females-exactly-1']
ratio_exactly_2 = df['Males-exactly-2'] / df['Females-exactly-2']
ratio_atleast_3 = df['Males-atleast-3'] / df['Females-atleast-3']
ratio_total = df['Tot-M'] / df['Tot-F']

# Same list-of-lists shape and row order as before, so downstream cells are unaffected.
r1 = [list(triple) for triple in zip(ratio_exactly_1, ratio_exactly_2, ratio_atleast_3)]
r2 = [[total, total, total] for total in ratio_total]

In [110]:
# One p-value per row of df: Welch's t-test (equal_var=False) comparing the
# three per-group male/female ratios in r1 against the repeated overall ratio in r2.
p_value = [
    ttest_ind(group_ratios, overall_ratio, equal_var=False).pvalue
    for group_ratios, overall_ratio in zip(r1, r2)
]

In [111]:
# Attach the per-row p-values; the list is in df's row order, which result1 shares.
result1['p-value'] = p_value

In [112]:
# Preview the first five rows of the "exactly one language" table.
result1.head(n=5)

Unnamed: 0,state-code,male-percentage,female-percentage,p-value
18,0,71.650118,76.464264,0.340433
48,1,45.264207,56.811956,0.537034
78,2,80.356377,83.482596,0.30633
108,3,50.242833,56.111899,0.568812
138,4,44.391497,45.86583,0.733277


## finding required values for people who can speak exactly two languages

In [113]:
# Table for the "exactly two languages" group, one row per row of df:
# the percentage of all males / all females counted in that group.
# 'p-value' is created empty here and filled in by a later cell.
result2 = pd.DataFrame(columns=['state-code', 'male-percentage', 'female-percentage', 'p-value'])
# Assigning a Series to the still-empty frame makes it adopt df's index.
result2['state-code'] = df['State-Code']
result2['male-percentage'] = df['Males-exactly-2'].div(df['Tot-M']).mul(100)
result2['female-percentage'] = df['Females-exactly-2'].div(df['Tot-F']).mul(100)

In [114]:
# NOTE(review): p_value is the same list already stored in result1. It was
# computed once from the ratios of all three language groups, so it is not
# specific to the "exactly two" group — confirm this reuse is intended.
result2['p-value'] = p_value

In [115]:
# Preview the first five rows of the "exactly two languages" table.
result2.head(n=5)

Unnamed: 0,state-code,male-percentage,female-percentage,p-value
18,0,20.241548,17.498692,0.340433
48,1,35.780514,28.995278,0.537034
78,2,13.88586,12.177476,0.30633
108,3,19.811803,17.589983,0.568812
138,4,24.857964,23.926519,0.733277


## finding required values for people who can speak at least 3 languages

In [116]:
# Table for the "at least three languages" group, one row per row of df:
# the percentage of all males / all females counted in that group.
# 'p-value' is created empty here and filled in by a later cell.
result3 = pd.DataFrame(columns=['state-code', 'male-percentage', 'female-percentage', 'p-value'])
# Assigning a Series to the still-empty frame makes it adopt df's index.
result3['state-code'] = df['State-Code']
result3['male-percentage'] = df['Males-atleast-3'].div(df['Tot-M']).mul(100)
result3['female-percentage'] = df['Females-atleast-3'].div(df['Tot-F']).mul(100)

In [117]:
# NOTE(review): p_value is the same list already stored in result1 and result2.
# It was computed once from the ratios of all three language groups, so it is
# not specific to the "at least three" group — confirm this reuse is intended.
result3['p-value'] = p_value

In [118]:
# Preview the first five rows of the "at least three languages" table.
result3.head(n=5)

Unnamed: 0,state-code,male-percentage,female-percentage,p-value
18,0,8.108334,6.037044,0.340433
48,1,18.955279,14.192766,0.537034
78,2,5.757763,4.339928,0.30633
108,3,29.945363,26.298118,0.568812
138,4,30.750539,30.207651,0.733277


In [119]:
# Write each result table to its own CSV file, without the index column.
for frame, out_path in (
    (result1, "gender-india-a.csv"),
    (result2, "gender-india-b.csv"),
    (result3, "gender-india-c.csv"),
):
    frame.to_csv(out_path, index=False)