# Multivariate EDA of Domestic Violence Against Women dataset.
#### Kharanshu Manoj Naghera (0792028)

## About Dataset:

This [dataset]("https://www.kaggle.com/datasets/fahmidachowdhury/domestic-violence-against-women") was uploaded on Kaggle by Fahmida Chowdhury and was last updated 22 June 2024. It contains domestic violence aganist women in a specific rural are of a developing country.

This dataset is taken under [Attribution 4.0 International License]("https://creativecommons.org/licenses/by/4.0/").

In [1]:
import pandas as pd
df = pd.read_csv('Domestic violence.csv')
df.head()

Unnamed: 0,SL. No,Age,Education,Employment,Income,Marital status,Violence
0,1,30,secondary,unemployed,0,married,yes
1,2,47,tertiary,unemployed,0,married,no
2,3,24,tertiary,unemployed,0,unmarred,no
3,4,22,tertiary,unemployed,0,unmarred,no
4,5,50,primary,unemployed,0,married,yes


In [2]:
print(df.columns)

Index(['SL. No', 'Age', 'Education ', 'Employment ', 'Income',
       'Marital status ', 'Violence '],
      dtype='object')


In [3]:
# Remove extra spaces from column names
df.columns = df.columns.str.strip()

# Verify the cleaned column names
print(df.columns)

Index(['SL. No', 'Age', 'Education', 'Employment', 'Income', 'Marital status',
       'Violence'],
      dtype='object')


In [4]:
# correcting spelling error
df['Marital status'] = df['Marital status'].replace({'unmarred': 'unmarried'})

In [5]:
# removing spaces from columns where they exist
df['Employment'] = df['Employment'].str.strip()

In [6]:
import plotly.graph_objects as go

grouped_df = df.groupby(['Marital status', 'Education', 'Employment']).size().reset_index(name='count')

marital_status = df['Marital status'].unique().tolist()
education = df['Education'].unique().tolist()
employment = df['Employment'].unique().tolist()

categories = marital_status + education + employment

category_index = {category: index for index, category in enumerate(categories)}

nodes = [{"name": category} for category in categories]

links = []
for _, row in grouped_df.iterrows():
    marital_idx = category_index[row['Marital status']]
    education_idx = category_index[row['Education']]
    employment_idx = category_index[row['Employment']]

    links.append({
        "source": marital_idx,
        "target": education_idx,
        "value": row['count']
    })

    links.append({
        "source": education_idx,
        "target": employment_idx,
        "value": row['count']
    })

color_map = {
    'married': '#34a0a4',
    'unmarried': '#34a0a4',
    'none': '#168aad',
    'primary': '#168aad',
    'secondary': '#168aad',
    'tertiary': '#168aad',
    'unemployed': '#1e6091',
    'employed': '#1e6091',
    'semi employed': '#1e6091'
}

node_colors = [color_map[node['name']] if node['name'] in color_map else '#888' for node in nodes]

fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=[node['name'] for node in nodes],
        color=node_colors
    ),
    link=dict(
        source=[link['source'] for link in links],
        target=[link['target'] for link in links],
        value=[link['value'] for link in links],
        color="rgba(0, 0, 0, 0.2)"
    )
)])

fig.update_layout(
    title_text="Alluvial Diagram of Marital Status, Education, and Employment",
    font=dict(family="cursive", size=12),
    plot_bgcolor='white',
    width=1600,
    height=600,
    title_x=0.5,
    xaxis=dict(showline=True, linewidth=2, linecolor='black'),
    yaxis=dict(showline=True, linewidth=2, linecolor='black')
)

fig.show()


In [19]:
import plotly.express as px

custom_color_sequence = ['#ef476f', '#4361ee']

fig_dot = px.strip(df, x='Marital status', y='Age', color='Violence',
                   category_orders={'Marital status': ['married', 'unmarried']},
                   color_discrete_sequence=custom_color_sequence,
                   title='Dot Plot of Age vs. Marital Status with Violence',
                   hover_data=df.columns
                   )

fig_dot.update_layout(
    xaxis=dict(title='Marital Status', showline=True, linewidth=2, linecolor='black'),
    yaxis=dict(title='Age', showline=True, linewidth=2, linecolor='black'),
    legend_title='Violence',
    font=dict(family="cursive", size=12),
    plot_bgcolor='white',
    width=800,
    height=600,
    title_x=0.5
)

fig_dot.show()

In [21]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Convert Income to numeric, replacing non-numeric values with 0
df['Income'] = pd.to_numeric(df['Income'], errors='coerce').fillna(0)

# Create age bins
age_bins = [0, 20, 30, 40, 50, 60]
age_labels = ['0-20', '21-30', '31-40', '41-50', '51-60']

# Create income bins
income_bins = [0, 1000, 5000, 10000, 20000, 30000, df['Income'].max()]
income_labels = ['0', '1-5k', '5k-10k', '10k-20k', '20k-30k', '30k+']

# Function to create pivot table
def create_pivot(df):
    df['Age_bin'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels, include_lowest=True)
    df['Income_bin'] = pd.cut(df['Income'], bins=income_bins, labels=income_labels, include_lowest=True)
    pivot = df.pivot_table(values='SL. No', index='Age_bin', columns='Income_bin', aggfunc='count', fill_value=0)
    return pivot

# Create pivot tables
pivot_yes = create_pivot(df[df['Violence'] == 'yes'])
pivot_no = create_pivot(df[df['Violence'] == 'no'])

# Create subplots
fig = make_subplots(rows=1, cols=2, subplot_titles=("Violence = Yes", "Violence = No"))

# Custom color sequences
custom_color_sequence = ['#fc2f00', '#c200fb']

# Add heatmaps
fig.add_trace(
    go.Heatmap(z=pivot_yes.values, x=pivot_yes.columns, y=pivot_yes.index, 
               colorscale=[[0, '#fcfcfc'], [1, custom_color_sequence[0]]],
               showscale=False),  # Remove color scale legend
    row=1, col=1
)
fig.add_trace(
    go.Heatmap(z=pivot_no.values, x=pivot_no.columns, y=pivot_no.index, 
               colorscale=[[0, '#fcfcfc'], [1, custom_color_sequence[1]]],
               showscale=False),  # Remove color scale legend
    row=1, col=2
)

# Update layout
fig.update_layout(
    title='Heatmap of Age vs Income for Domestic Violence',
    font=dict(family="cursive", size=12),
    width=1000,
    height=600,
    title_x=0.5,
    paper_bgcolor='#fcfcfc'
)

# Update axes
for i in [1, 2]:
    fig.update_xaxes(title_text="Income", row=1, col=i, showline=True, linewidth=2, linecolor='black')
    fig.update_yaxes(title_text="Age", row=1, col=i, showline=True, linewidth=2, linecolor='black')

# Show the figure
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/