In [None]:
#Data Source URL: https://www.kaggle.com/albabshams/suicides-in-india-analysed/data

import os
# Directory that holds the input dataset folder(s) read later in the notebook.
PATH = '../input'
# Display the entries found directly under PATH.
os.listdir(PATH)

Import statements

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from subprocess import check_output
# Load the raw suicide records from the CSV file into a dataframe,
# then preview the first five rows.
df = pd.read_csv(
    filepath_or_buffer='../input/suicides-in-india-20072018/Suicides in India 2007-2018.csv'
)
df.head(n=5)

In [None]:
df.shape  # (number of rows, number of columns) of the dataframe

In [None]:
df.info()  # Per-column non-null counts and data types

In [None]:
df.describe()  # Summary statistics such as count, mean, standard deviation, etc.

In [None]:
# Keep only the rows whose Type_code is 'Education_Status', i.e. the records
# broken down by the education status of the person.
is_education_row = df['Type_code'] == 'Education_Status'
eduDf = df[is_education_row]
# Compare the size of the full dataframe with the size of the subset.
df.shape, eduDf.shape

In [None]:
# Build one dataframe per remaining Type_code category:
# causes, professional profile and social status.
is_causes_row = df['Type_code'] == 'Causes'
is_profession_row = df['Type_code'] == 'Professional_Profile'
is_social_row = df['Type_code'] == 'Social_Status'

causesDf = df[is_causes_row]
profDf = df[is_profession_row]
socialDf = df[is_social_row]

In [None]:
# Row/column counts of the full dataframe and of each Type_code subset.
df.shape, eduDf.shape, causesDf.shape, profDf.shape, socialDf.shape

In [None]:
eduDf.head()  # Preview the education subset; the other Type_code subsets can be displayed the same way

**1. Distribution of suicides according to Educational status and Gender**

In [None]:
# Narrow the education subset down to the three columns used for the plot:
# the education level (Type), the gender and the suicide count (Total).
edu_columns = ['Type', 'Gender', 'Total']
eduDf = eduDf[edu_columns]
eduDf.head()

In [None]:
# Total suicides for every (Type, Gender) pair, largest totals first.
# as_index=False keeps Type and Gender as ordinary columns rather than
# turning them into the index of the result.
edu_totals = eduDf.groupby(['Type', 'Gender'], as_index=False).sum()
edSort = edu_totals.sort_values(by='Total', ascending=False)
edSort.head()

In [None]:
# Grouped bar chart of total suicides per education level, split by gender.
# hue drives the legend (one bar colour per gender); palette picks the colour scheme
# (Palette types: plasma, viridis, inferno, magma, Blues, Accent, Dark2, Paired etc)
# for palette list: https://bokeh.pydata.org/en/latest/docs/reference/palettes.html
plt.figure(figsize=(12, 6))
sns.barplot(
    data=edSort,
    x='Type',
    y='Total',
    hue='Gender',
    palette='Set1',
)
# Slant the category labels so the long names do not overlap.
plt.xticks(ha='right', rotation=45)
plt.tight_layout()

The number of suicides appears to be concentrated at the lower education levels. Most of the people who have committed suicide had an education level below Matriculate/Secondary.

**2. Distribution of number of suicides on the basis of social status**

In [None]:
# Keep the plotting columns, then total the suicides for every
# (Type, Gender) pair and order the result from largest to smallest.
social_columns = ['Type', 'Gender', 'Total']
socialDf = socialDf[social_columns]
social_totals = socialDf.groupby(['Type', 'Gender'], as_index=False).sum()
socialSort = social_totals.sort_values(by='Total', ascending=False)
socialSort.head()

In [None]:
# Grouped bar chart of total suicides per social status, split by gender.
plt.figure(figsize=(9, 6))
sns.barplot(
    data=socialSort,
    x='Type',
    y='Total',
    hue='Gender',
    palette='Set1',
)
# Slant the category labels so they stay readable.
plt.xticks(ha='right', rotation=45)
plt.tight_layout()

**3. Distribution of number of suicides on the basis of Cause**

In [None]:
# Work on an explicit, independent copy of the three plotting columns.
# causesDf was produced by boolean filtering, so writing into it directly can
# trigger pandas' SettingWithCopyWarning. The former `causesDf.is_copy = False`
# workaround relies on an attribute that modern pandas no longer provides,
# so .copy() is used instead.
causesDf = causesDf[['Type','Gender','Total']].copy()

# Merge spelling variants of the same cause under a single label so that they
# end up in the same group below (old label -> unified label).
type_renames = {
    'Bankruptcy or Sudden change in Economic': 'Change in Economic Status',
    'Bankruptcy or Sudden change in Economic Status': 'Change in Economic Status',
    'Other Causes (Please Specity)': 'Causes Not known',
    'Not having Children (Barrenness/Impotency': 'Not having Children(Barrenness/Impotency',
}
for old_label, new_label in type_renames.items():
    causesDf.loc[causesDf['Type']==old_label,'Type'] = new_label

# Total suicides for every (cause, gender) pair, largest totals first.
causesSort = causesDf.groupby(['Type','Gender'],as_index=False).sum().sort_values('Total',ascending=False)

# Grouped bar chart of total suicides per cause, split by gender.
plt.figure(figsize=(12,6))
sns.barplot(x='Type',y='Total',data=causesSort,hue='Gender',palette='Set1')
plt.xticks(rotation=45,ha='right')
plt.tight_layout()

While most of the causes of the suicides are not known, the three major causes among the known cases are Family problems, Prolonged illness and mental illness.

According to the data, men seem to be more badly affected than women by unemployment, property dispute, poverty, drug abuse or addiction, and change in economic status.

The number of women who have committed suicide due to dowry disputes is much higher than the number of men.

**4. Distribution of number of suicides on the basis of Professional Profile**

In [None]:
# Keep the plotting columns, then total the suicides for every
# (profession, gender) pair and order the result from largest to smallest.
prof_columns = ['Type', 'Gender', 'Total']
profDf = profDf[prof_columns]
prof_totals = profDf.groupby(['Type', 'Gender'], as_index=False).sum()
profSort = prof_totals.sort_values(by='Total', ascending=False)

# Grouped bar chart of total suicides per professional profile, split by gender.
plt.figure(figsize=(12, 6))
sns.barplot(
    data=profSort,
    x='Type',
    y='Total',
    hue='Gender',
    palette='Set1',
)
plt.xticks(ha='right', rotation=45)
plt.tight_layout()

According to the data, most of the females who have committed suicide are housewives. The percentage of female suicides in all other categories is much lower than the percentage of male suicides. While this is an indication of the lower representation of women in professional careers, it also reiterates the importance of girls' education and women's empowerment. The graph suggests that financially independent women are mentally much stronger.
It is disheartening to see that farmers, who feed the rest of the country, commit suicide more than people in any other profession. After farmers, it is the unemployed and private sector employees who are most affected. It is surprising that the number of suicides among the unemployed and among private sector employees is almost the same. This might be due to the higher pressure in private sector jobs compared to government sector jobs.

**5. Distribution of number of suicides on the basis of State and Age group**

In [None]:
# Take the 'Causes' records again from the full dataframe, this time keeping
# every column (State and Age_group are needed for the next steps).
is_cause_record = df['Type_code'] == 'Causes'
causes = df[is_cause_record]
causes.head()

In [None]:
# Sum the suicide counts for every (State, Age_group) pair.
# Only 'Total' is aggregated: a bare .sum() would also try to aggregate the
# other columns of the frame (the Year values and the text columns), which is
# meaningless here and is handled differently across pandas versions.
causesGrp = causes.groupby(['State','Age_group'],as_index=False)[['Total']].sum()
causesGrp.head()

In [None]:
# Reshape to a matrix for the heatmap: one row per age group,
# one column per state, cell value = total suicides.
causesGrpPvt = causesGrp.pivot(
    index='Age_group',
    columns='State',
    values='Total',
)
causesGrpPvt.head()

In [None]:
# Heatmap of total suicides: age groups on the rows, states on the columns.
plt.figure(figsize=(14,6))
sns.heatmap(causesGrpPvt,cmap='YlGnBu')
# Rotate the state labels only after the heatmap exists: styling the ticks
# before plotting acts on placeholder ticks that the heatmap then replaces.
plt.xticks(rotation=45,ha='right')
plt.tight_layout()

This plot gives us an insight into the number of suicides by state and age group. As we can see, 15-29 is the most vulnerable age group in all states except Kerala. Maharashtra is the state with the highest number of suicides. The number of suicides per square kilometre might give us some more information on this.

**6. Distribution of number of suicides on the basis of State and per square km**

In [None]:
# Take the 'Education_Status' records from the full dataframe with every
# column kept (State is needed for the per-state totals below).
is_education_record = df['Type_code'] == 'Education_Status'
edu = df[is_education_record]
edu.head()

In [None]:
# Total suicides for every (State, Gender) pair, largest totals first.
# Only 'Total' is aggregated: a bare .sum() would also try to aggregate the
# other columns of the frame (the Year values and the text columns), which is
# meaningless here and is handled differently across pandas versions.
st = edu.groupby(['State','Gender'],as_index=False)[['Total']].sum().sort_values('Total',ascending=False)
st.head()

In [None]:
# Drop the aggregate rows ("Total (States)", "Total (All India)",
# "Total (Uts)") so that only individual states/territories remain.
is_aggregate_row = (
    (st['State'] == 'Total (States)')
    | (st['State'] == 'Total (All India)')
    | (st['State'] == 'Total (Uts)')
)
st = st[~is_aggregate_row]
st.head()

In [None]:
# Area of each state / union territory, keyed by the name used in the
# dataset's State column. Values for areas are taken from wikipedia.
statesArea = {
    'Maharashtra': 307713,
    'West Bengal': 88752,
    'Tamil Nadu': 130058,
    'Andhra Pradesh': 275045,
    'Karnataka': 191791,
    'Kerala': 38863,
    'Madhya Pradesh': 308350,
    'Gujarat': 196024,
    'Chhattisgarh': 135191,
    'Odisha': 155707,
    'Rajasthan': 342239,
    'Uttar Pradesh': 243290,
    'Assam': 78438,
    'Haryana': 44212,
    'Delhi (Ut)': 1484,
    'Jharkhand': 79714,
    'Punjab': 50362,
    'Bihar': 94163,
    'Tripura': 10486,
    'Puducherry': 562,
    'Himachal Pradesh': 55673,
    'Uttarakhand': 53483,
    'Goa': 3702,
    'Jammu & Kashmir': 222236,
    'Sikkim': 7096,
    'A & N Islands': 8249,
    'Arunachal Pradesh': 83743,
    'Meghalaya': 22429,
    'Chandigarh': 114,
    'Mizoram': 21081,
    'D & N Haveli': 491,
    'Manipur': 22327,
    'Nagaland': 16579,
    'Daman & Diu': 112,
    'Lakshadweep': 32,
}
statesArea

In [None]:
# Add the "Area" column to the table/dataset.
# st was produced by filtering another dataframe, so take an explicit copy
# before adding a column to it (avoids pandas' SettingWithCopyWarning).
st = st.copy()
# Look every state name up in statesArea in one vectorised pass; states that
# are missing from the dictionary get NaN, as with the former per-state loop.
st['Area'] = st['State'].map(statesArea)
st.head()

In [None]:
# Suicide density: total suicides of each row divided by the area of its state.
suicides_per_km = st['Total'] / st['Area']
st['Suicides_per_squareKm'] = suicides_per_km
st.head()

In [None]:
# Order the rows by suicide density, highest Suicides_per_squareKm first.
sortedStates = st.sort_values(
    by='Suicides_per_squareKm',
    ascending=False,
)
sortedStates.head()

In [None]:
# Grouped bar chart of suicides per square kilometre for each state,
# split by gender.
plt.figure(figsize=(12, 6))
sns.barplot(
    data=sortedStates,
    x='State',
    y='Suicides_per_squareKm',
    hue='Gender',
    palette='Set1',
)
# Slant the state names so they stay readable.
plt.xticks(ha='right', rotation=45)
plt.tight_layout()

Union territories, whose areas are much smaller than those of the states, have a higher number of suicides per square kilometre. Again, Kerala is an exception here, as it is larger than all the union territories. Kerala is followed by West Bengal and Tamil Nadu among the Indian states with a higher number of suicides.

**7. Distribution of number of suicides in India overall**

In [None]:
# Country-wide records: the 'Education_Status' rows whose State is the
# 'Total (All India)' aggregate.
is_education_record = df['Type_code'] == 'Education_Status'
is_all_india_row = df['State'] == 'Total (All India)'
indiaOverall = df[is_education_record & is_all_india_row]
indiaOverall.head()

In [None]:
# Country-wide suicide count per year.
# Only 'Total' is aggregated: a bare .sum() would also try to aggregate the
# text columns of the frame, which is meaningless here and is handled
# differently across pandas versions.
overall = indiaOverall.groupby(['Year'],as_index=False)[['Total']].sum()
overall.head()

In [None]:
# Bar chart of the country-wide suicide count for each year.
plt.figure(figsize=(9,4))
sns.barplot(x='Year',y='Total',data=overall,palette='Set1').set_title('Suicides in India overall')
# Rotate the year labels only after the bars exist: styling the ticks before
# plotting acts on placeholder ticks that the bar plot then replaces.
plt.xticks(rotation=45,ha='right')
plt.tight_layout()

In [None]:
# Percentage increase in suicides from 2007 to 2018.
# .iloc[0] pulls the single matching value out of the filtered Series before
# converting it, instead of calling int() on the Series itself.
Suicides_in_2018 = int(overall.loc[overall['Year']==2018,'Total'].iloc[0])
Suicides_in_2007 = int(overall.loc[overall['Year']==2007,'Total'].iloc[0])
# A percentage increase is measured against the starting value, so the change
# is divided by the 2007 total (dividing by the 2018 total understates it).
# NOTE(review): re-check any percentage quoted in the accompanying text
# against the value displayed here.
(Suicides_in_2018 - Suicides_in_2007)*100/Suicides_in_2007

The total number of suicides in the country is increasing with time. 
From 2007 to 2018 the percentage increase in suicides is 20%, which is very scary.