# Analyse LGBTQIA+ Survey

## Imports

In [14]:
import pandas as pd
import numpy as np
import re
import unidecode
import datetime
import time
import itertools
import operator
from difflib import SequenceMatcher

df = pd.read_csv("../data/LGBT_Survey_ViolenceAndHarassment.csv")
df.head()

Unnamed: 0,CountryCode,subset,question_code,question_label,answer,percentage,notes
0,Austria,Lesbian,e1,Do you avoid holding hands in public with a sa...,Yes,33,
1,Austria,Lesbian,e1,Do you avoid holding hands in public with a sa...,No,53,
2,Austria,Lesbian,e1,Do you avoid holding hands in public with a sa...,I do not have a same-sex partner,12,
3,Austria,Lesbian,e1,Do you avoid holding hands in public with a sa...,Don`t know,2,[0]
4,Austria,Gay,e1,Do you avoid holding hands in public with a sa...,Yes,51,


## Data Preparation

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45355 entries, 0 to 45354
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   CountryCode     45355 non-null  object
 1   subset          45355 non-null  object
 2   question_code   45355 non-null  object
 3   question_label  45355 non-null  object
 4   answer          45355 non-null  object
 5   percentage      45355 non-null  object
 6   notes           31969 non-null  object
dtypes: object(7)
memory usage: 2.4+ MB


In [16]:
# check the countries
df.CountryCode.unique()

array(['Austria', 'Belgium', 'Bulgaria', 'Cyprus', 'Czech Republic',
       'Germany', 'Denmark', 'Estonia', 'Greece', 'Spain', 'Finland',
       'France', 'Croatia', 'Hungary', 'Ireland', 'Italy', 'Lithuania',
       'Luxembourg', 'Latvia', 'Malta', 'Netherlands', 'Poland',
       'Portugal', 'Romania', 'Sweden', 'Slovenia', 'Slovakia',
       'United Kingdom', 'Average'], dtype=object)

In [17]:
# check the subset column
df.subset.unique()

array(['Lesbian', 'Gay', 'Bisexual women', 'Bisexual men', 'Transgender'],
      dtype=object)

In [18]:
# check if the amount of codes and labels are the same
len(df.question_code.unique()) == len(df.question_label.unique())

True

In [19]:
# check the notes column: [0]: small sample size; [1]: NA due to small sample size; [2]: missing value
df.notes.value_counts() 

[0]      25072
 [1]      6897
Name: notes, dtype: int64

In [20]:
# check the questions
df.question_label.unique().tolist()

['Do you avoid holding hands in public with a same-sex partner for fear of being assaulted, threatened of harassed?',
 'Do you avoid certain places or locations for fear of being assaulted, threatened or harassed because you are L, G, B or T?',
 'Where do you avoid being open about yourself as L, G, B or T for fear of being assaulted, threatened or harassed by others?',
 'In the last 5 years, have you been: physically/sexually attacked or threatened with violence at home or elsewhere (street, on public transport, at your workplace, etc) for any reason?',
 'In the last 5 years, have you been: personally harassed by someone or a group for any reason in a way that really annoyed, offended or upset you - either at work, home, on the street, on public transport, in a shop, in an office or on the internet ?',
 'Where did the last incident of physical / sexual attack or threat of violence happen?',
 'Did you or anyone else report the last incident of physical / sexual attack or threat of viol

## What are the most dangerous places for LGBTQIA+ people?

In [21]:
def filterQuestions(keyword):
    questions = df.question_label.unique().tolist()
    keyword_list = []
    
    for question in questions:
        if keyword in question:
            keyword_list.append(question)
            
    return keyword_list

In [22]:
place_questions = filterQuestions('Where')
place_questions

['Where do you avoid being open about yourself as L, G, B or T for fear of being assaulted, threatened or harassed by others?',
 'Where did the last incident of physical / sexual attack or threat of violence happen?',
 'The MOST SERIOUS physical / sexual attack or threat of violence - Where did it happen?',
 'LAST incident of harassment in the past 12 months - Where did it happen?',
 'MOST SERIOUS incident of harassment - Where did it happen?']

In [23]:
df_place = df[df.question_label.isin(place_questions)]
df_place = df_place[df_place.percentage != ':']

df_place.percentage = df_place.percentage.astype(int)

df_place.head()

Unnamed: 0,CountryCode,subset,question_code,question_label,answer,percentage,notes
1015,Austria,Lesbian,e3,Where do you avoid being open about yourself a...,My home,6,[0]
1016,Austria,Lesbian,e3,Where do you avoid being open about yourself a...,School,15,
1017,Austria,Lesbian,e3,Where do you avoid being open about yourself a...,Workplace,36,
1018,Austria,Lesbian,e3,Where do you avoid being open about yourself a...,"A cafe, restaurant, pub, club",39,
1019,Austria,Lesbian,e3,Where do you avoid being open about yourself a...,Public transport,53,


In [27]:
for subset in df.subset.unique().tolist(): 
    df_subset = df_place[df_place.subset == subset]
    df_subset = df_subset.groupby('answer').sum().sort_values(by='percentage', ascending=False)
    df_subset = df_subset[df_subset.percentage != 0]
    
    answers_list = df_subset.index.tolist()

    for answer_1 in answers_list:
        for answer_2 in answers_list:
            similarity = SequenceMatcher(a=answer_1,b=answer_2).ratio()
            if(similarity > 0.6 and similarity != 1):
                percentage = df_subset[df_subset.index == answer_2]['percentage'].tolist()[0]
                df_subset[df_subset.index == answer_1] += percentage
                answers_list.remove(answer_2)
                df_subset.drop(answer_2, inplace=True)
    print("*"*50)
    print('\nSubset: ' + subset)
    print(df_subset[df_subset.percentage >= df_subset.percentage.quantile(0.75)])
            

    

**************************************************

Subset: Lesbian
                                                    percentage
answer                                                        
In a street, square, car parking lot or other p...        5293
Public transport                                          2489
Public premises or buildings                              1554
A cafe, restaurant, pub, club                             2336
Workplace                                                 2098
**************************************************

Subset: Gay
                                                    percentage
answer                                                        
In a street, square, car parking lot or other p...        6372
Public transport                                          2749
Public premises or buildings                              1829
A cafe, restaurant, pub, club                             2683
Workplace                                        