# Analyse LGBTQIA+ Survey

## Imports

In [64]:
import pandas as pd
import numpy as np
import re
import unidecode
import datetime
import time
import itertools
import operator
from difflib import SequenceMatcher

# Load the survey CSV (path is relative to the notebook's working directory)
# into the notebook-level frame `df`, then preview the first rows.
df = pd.read_csv("../data/LGBT_Survey_ViolenceAndHarassment.csv")
df.head()

Unnamed: 0,CountryCode,subset,question_code,question_label,answer,percentage,notes
0,Austria,Lesbian,e1,Do you avoid holding hands in public with a sa...,Yes,33,
1,Austria,Lesbian,e1,Do you avoid holding hands in public with a sa...,No,53,
2,Austria,Lesbian,e1,Do you avoid holding hands in public with a sa...,I do not have a same-sex partner,12,
3,Austria,Lesbian,e1,Do you avoid holding hands in public with a sa...,Don`t know,2,[0]
4,Austria,Gay,e1,Do you avoid holding hands in public with a sa...,Yes,51,


## Data Preparation

In [65]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45355 entries, 0 to 45354
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   CountryCode     45355 non-null  object
 1   subset          45355 non-null  object
 2   question_code   45355 non-null  object
 3   question_label  45355 non-null  object
 4   answer          45355 non-null  object
 5   percentage      45355 non-null  object
 6   notes           31969 non-null  object
dtypes: object(7)
memory usage: 2.4+ MB


In [66]:
# check the countries
df.CountryCode.unique()

array(['Austria', 'Belgium', 'Bulgaria', 'Cyprus', 'Czech Republic',
       'Germany', 'Denmark', 'Estonia', 'Greece', 'Spain', 'Finland',
       'France', 'Croatia', 'Hungary', 'Ireland', 'Italy', 'Lithuania',
       'Luxembourg', 'Latvia', 'Malta', 'Netherlands', 'Poland',
       'Portugal', 'Romania', 'Sweden', 'Slovenia', 'Slovakia',
       'United Kingdom', 'Average'], dtype=object)

In [67]:
# check the subset column
df.subset.unique()

array(['Lesbian', 'Gay', 'Bisexual women', 'Bisexual men', 'Transgender'],
      dtype=object)

In [68]:
# check whether the number of distinct question codes equals the number of
# distinct question labels (this compares counts only, not the pairing)
df.question_code.nunique(dropna=False) == df.question_label.nunique(dropna=False)

True

In [69]:
# check the notes column: [0]: small sample size; [1]: NA due to small sample size; [2]: missing value
df.notes.value_counts() 

[0]      25072
 [1]      6897
Name: notes, dtype: int64

In [70]:
# check the questions
df.question_label.unique().tolist()

['Do you avoid holding hands in public with a same-sex partner for fear of being assaulted, threatened of harassed?',
 'Do you avoid certain places or locations for fear of being assaulted, threatened or harassed because you are L, G, B or T?',
 'Where do you avoid being open about yourself as L, G, B or T for fear of being assaulted, threatened or harassed by others?',
 'In the last 5 years, have you been: physically/sexually attacked or threatened with violence at home or elsewhere (street, on public transport, at your workplace, etc) for any reason?',
 'In the last 5 years, have you been: personally harassed by someone or a group for any reason in a way that really annoyed, offended or upset you - either at work, home, on the street, on public transport, in a shop, in an office or on the internet ?',
 'Where did the last incident of physical / sexual attack or threat of violence happen?',
 'Did you or anyone else report the last incident of physical / sexual attack or threat of viol

## What are the most dangerous places for LGBTQIA+ people?

In [71]:
def filterQuestions(keyword, data=None):
    """Return the distinct question labels that contain ``keyword``.

    Args:
        keyword: Substring to look for in each question label. The match is
            case-sensitive (plain ``in`` test).
        data: Optional DataFrame with a ``question_label`` column. When
            omitted, the notebook-level ``df`` is used, so existing
            one-argument calls keep working.

    Returns:
        list: The unique question labels containing ``keyword``, in order of
        first appearance in the frame. Empty when nothing matches.
    """
    # Fall back to the notebook-global frame only when no frame is supplied.
    source = df if data is None else data
    return [
        question
        for question in source.question_label.unique().tolist()
        if keyword in question
    ]

def popularAnswers(df, similarity_threshold=0.6):
    """Print, for every subset, the answers with the highest summed percentage.

    For each value of ``df.subset`` the ``percentage`` column is summed per
    ``answer``. Answers whose wording is nearly identical (difflib ratio above
    ``similarity_threshold``) are folded into the more popular wording, and
    the rows at or above the 75th percentile of the resulting totals are
    printed in descending order.

    Args:
        df: DataFrame with ``subset``, ``answer`` and a numeric ``percentage``
            column (e.g. the output of ``filterDataByQuestions``).
        similarity_threshold: ``SequenceMatcher.ratio()`` value that two
            different answer texts must exceed to be treated as the same
            answer. Defaults to 0.6.

    Returns:
        None. The result is written to stdout.
    """
    for subset in df.subset.unique().tolist():
        # Aggregate only the numeric column: summing the string columns as
        # well would concatenate text on pandas >= 2.0 and break the merge.
        df_subset = (
            df[df.subset == subset]
            .groupby('answer')[['percentage']]
            .sum()
            .sort_values(by='percentage', ascending=False)
        )
        # Explicit copy so the in-place edits below act on an independent frame.
        df_subset = df_subset[df_subset.percentage != 0].copy()

        # Iterate over a fixed snapshot and remember what was merged away.
        # Removing items from the list being iterated skips candidates and can
        # leave the merged total under the least popular wording.
        candidates = df_subset.index.tolist()
        merged = set()

        for answer_1 in candidates:
            if answer_1 in merged:
                continue
            for answer_2 in candidates:
                if answer_2 == answer_1 or answer_2 in merged:
                    continue
                similarity = SequenceMatcher(a=answer_1, b=answer_2).ratio()
                if similarity > similarity_threshold:
                    # Fold the near-duplicate's current total into answer_1.
                    df_subset.loc[answer_1, 'percentage'] += df_subset.loc[answer_2, 'percentage']
                    df_subset.drop(answer_2, inplace=True)
                    merged.add(answer_2)

        print("*"*50)
        print('\nSubset: ' + subset)
        top_quartile = df_subset.percentage >= df_subset.percentage.quantile(0.75)
        print(df_subset[top_quartile].sort_values('percentage', ascending=False))
        
def filterDataByQuestions(df, questions_list):
    """Return the rows for the given questions with integer percentages.

    Args:
        df: DataFrame with ``question_label`` and ``percentage`` columns.
        questions_list: Question labels to keep (exact matches).

    Returns:
        A new DataFrame holding only rows whose ``question_label`` is in
        ``questions_list`` and whose ``percentage`` is not the ``':'``
        placeholder, with ``percentage`` converted to ``int``. The input
        frame is left unmodified.

    Raises:
        ValueError: If a remaining ``percentage`` value cannot be parsed as
            an integer.
    """
    # ':' is the only non-numeric marker filtered here (presumably the
    # "not available" placeholder -- confirm against the data dictionary).
    keep = df.question_label.isin(questions_list) & (df.percentage != ':')

    # Work on an explicit copy: assigning a column on a filtered view raises
    # SettingWithCopyWarning and leaves ownership of the data ambiguous.
    df_questions = df.loc[keep].copy()
    df_questions['percentage'] = df_questions['percentage'].astype(int)

    return df_questions

In [72]:
# Select every question whose label contains 'Where' (case-sensitive), keep
# the matching rows with numeric percentages, and print the top answers for
# each subset.
# NOTE(review): in the question list above, 'Where' matches both the
# "Where do you avoid being open..." question and the "Where did the last
# incident ... happen?" question, so avoidance and incident locations are
# summed together -- confirm this is intended.
place_questions = filterQuestions('Where')

df_place = filterDataByQuestions(df, place_questions)
popularAnswers(df_place)

**************************************************

Subset: Lesbian
                                                    percentage
answer                                                        
In a street, square, car parking lot or other p...        5293
Public transport                                          2489
A cafe, restaurant, pub, club                             2336
Workplace                                                 2098
Public premises or buildings                              1554
**************************************************

Subset: Gay
                                                    percentage
answer                                                        
In a street, square, car parking lot or other p...        6372
Public transport                                          2749
A cafe, restaurant, pub, club                             2683
Workplace                                                 2362
Public premises or buildings                     

## Perpetrator: who they were, their gender, and whether they acted alone

In [73]:
# List the question labels that contain 'perpetrator' (case-sensitive match).
filterQuestions('perpetrator')

['Was the perpetrator alone, or was there more than one perpetrator?',
 'LAST incident of physical / sexual attack or threat of violence in the past 12 months - who was the perpetrator?',
 'LAST incident of physical / sexual attack or threat of violence in the past 12 months - What was the gender of the perpetrator(s)?',
 'LAST incident of physical / sexual attack or threat of violence in the past 12 months - What do you think the perpetrator(s) was? *',
 'MOST SERIOUS physical / sexual attack or threat of violence - Was the perpetrator alone, or was there more than one perpetrator?',
 'MOST SERIOUS physical / sexual attack or threat of violence - Do you think the perpetrator(s) was? *',
 'MOST SERIOUS physical / sexual attack or threat of violence - What was the gender of the perpetrator(s)?',
 'MOST SERIOUS physical / sexual attack or threat of violence - Do you think the perpetrator(s) was? (sexual orientation)',
 'LAST incident of harassment in the past 12 months - Was the perpetra

In [74]:
# Questions asking whether the perpetrator acted alone: keep their rows with
# numeric percentages and print the top answers for each subset.
amount_questions = filterQuestions('Was the perpetrator alone')

df_amount = filterDataByQuestions(df, amount_questions)
popularAnswers(df_amount)

**************************************************

Subset: Lesbian
                   percentage
answer                       
More perpetrators        5135
**************************************************

Subset: Gay
                   percentage
answer                       
More perpetrators        7092
**************************************************

Subset: Bisexual men
                   percentage
answer                       
More perpetrators        4746
**************************************************

Subset: Transgender
                   percentage
answer                       
More perpetrators        5121
**************************************************

Subset: Bisexual women
        percentage
answer            
Alone         4567


In [75]:
# Questions about the gender of the perpetrator(s): keep their rows with
# numeric percentages and print the top answers for each subset.
gender_questions = filterQuestions('gender of the perpetrator')

df_gender = filterDataByQuestions(df, gender_questions)
popularAnswers(df_gender)

**************************************************

Subset: Lesbian
        percentage
answer            
Male          6887
**************************************************

Subset: Gay
        percentage
answer            
Male          9074
**************************************************

Subset: Bisexual men
        percentage
answer            
Male          6078
**************************************************

Subset: Transgender
        percentage
answer            
Male          6113
**************************************************

Subset: Bisexual women
        percentage
answer            
Male          5835


In [76]:
# Questions about who the perpetrator was: keep their rows with numeric
# percentages and print the top answers for each subset.
# NOTE(review): the keyword match is case-sensitive, so only labels containing
# the lowercase phrase 'who was the perpetrator' are selected -- verify that
# no differently-cased variant of the question is missed.
identity_questions = filterQuestions('who was the perpetrator')

df_identity = filterDataByQuestions(df, identity_questions)
popularAnswers(df_identity)

**************************************************

Subset: Lesbian
                                            percentage
answer                                                
Someone else you didn`t know                      4356
Teenager or group of teenagers                    1945
Someone from school, college or university        1063
**************************************************

Subset: Gay
                                            percentage
answer                                                
Someone else you didn`t know                      4605
Teenager or group of teenagers                    2636
Someone from school, college or university        1542
**************************************************

Subset: Bisexual men
                                            percentage
answer                                                
Someone else you didn`t know                      3232
Teenager or group of teenagers                    1727
Someone from school, coll

## Police: Did you or anyone else report it to the police? /  Why did you not report it to the police?

In [77]:
# List the question labels that contain 'police' (case-sensitive match).
filterQuestions('police')

['Did you or anyone else report the last incident of physical / sexual attack or threat of violence to the police?',
 'Why did you not report the last incident of physical / sexual attack or threat of violence to the police?',
 'MOST SERIOUS physical / sexual attack or threat of violence - Did you or anyone else report it to the police?',
 'MOST SERIOUS physical / sexual attack or threat of violence - Why did you not report it to the police?',
 'LAST incident of harassment in the past 12 months - Did you or anyone else report it to the police?',
 'LAST incident of harassment in the past 12 months - Why did you not report it to the police?',
 'MOST SERIOUS incident of harassment - Did you or anyone else report it to the police?',
 'MOST SERIOUS incident of harassment - Why did you not report it to the police?']

In [78]:
# Questions asking whether the incident was reported to the police: keep their
# rows with numeric percentages and print the top answers for each subset.
# NOTE(review): the output below lists 'No' and 'No, did not report' as
# separate answers; they are not merged by the similarity step in
# popularAnswers, so the "not reported" share is split across two rows.
report_questions = filterQuestions('Did you or anyone else report')

df_report = filterDataByQuestions(df, report_questions)
popularAnswers(df_report)

**************************************************

Subset: Lesbian
                    percentage
answer                        
No                        9160
No, did not report        9136
Yes                       1118
**************************************************

Subset: Gay
                    percentage
answer                        
No, did not report       10064
No                        9760
Yes                       1440
**************************************************

Subset: Bisexual men
                    percentage
answer                        
No, did not report        7057
No                        6811
Yes                        985
**************************************************

Subset: Transgender
                    percentage
answer                        
No                        7537
No, did not report        7417
Yes                       1150
**************************************************

Subset: Bisexual women
                    percenta

In [79]:
# Questions asking why the incident was not reported to the police: keep their
# rows with numeric percentages and print the top answers for each subset.
reason_questions = filterQuestions('Why did you not report')

df_reasons = filterDataByQuestions(df, reason_questions)
popularAnswers(df_reasons)

**************************************************

Subset: Lesbian
                                                    percentage
answer                                                        
Did not think they would do anything                      6566
Too minor / not serious enough / never occurred...        4799
Dealt with it myself/involved a friend/family m...        2457
Shame, embarrassment, didn`t want anyone to know          1548
**************************************************

Subset: Gay
                                                    percentage
answer                                                        
Did not think they would do anything                      8086
Too minor / not serious enough / never occurred...        4883
Shame, embarrassment, didn`t want anyone to know          2603
Fear of a homophobic and/or transphobic reactio...        2415
**************************************************

Subset: Transgender
                                        