# 1. Importing libraries

## 1.1 Installing packages

In [3]:
!pip install translate

Collecting translate
  Downloading translate-3.6.1-py2.py3-none-any.whl (12 kB)
Collecting libretranslatepy==2.1.1 (from translate)
  Downloading libretranslatepy-2.1.1-py3-none-any.whl (3.2 kB)
Installing collected packages: libretranslatepy, translate
Successfully installed libretranslatepy-2.1.1 translate-3.6.1


In [5]:
!pip install deepl

Collecting deepl
  Downloading deepl-1.18.0-py3-none-any.whl (35 kB)
Installing collected packages: deepl
Successfully installed deepl-1.18.0


## 1.2 Adding source and path to find files (Google Colab)

In [1]:
import os
from google.colab import drive

In [2]:
# Mount Google Drive into the Colab filesystem so the project files are reachable.
drive.mount('/content/drive')
# Project data folder on Drive; it becomes the working directory so that
# later cells can read and write files using bare file names.
path = '/content/drive/MyDrive/Voice of Nature/Data/'
os.chdir(path)

Mounted at /content/drive












# 2. Loading data

## 2.1 Libraries and functions

In [6]:
import pandas as pd
import numpy as np
import re

# Translation
from translate import Translator
import deepl
# DeepL client used by the translation cells. deepl.Translator requires the
# API key as its first argument, so the bare call could not run as written.
# The key is read from the DEEPL_AUTH_KEY environment variable, falling back to
# an interactive prompt, so it never has to be pasted into the notebook.
import os
from getpass import getpass
translator = deepl.Translator(os.environ.get('DEEPL_AUTH_KEY') or getpass('DeepL API key: '))

# Data Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def replace(text, replacements=None, lower=True):
    '''Applies a series of substring replacements to a piece of text.

    Parameters
    ----------
    text : object
        Value to clean. Non-string values (e.g. numeric labels) are converted
        with ``str()`` before any string operation is applied.
    replacements : dict, optional
        Mapping of ``old -> new`` substrings. Entries are applied one after
        another in the dictionary's iteration order, so a later entry sees the
        result of the earlier ones (e.g. ``'__': '_'`` placed last collapses
        double underscores produced by earlier substitutions).
    lower : bool, default True
        If truthy, the text is lower-cased before the replacements run, so the
        ``old`` keys should then be given in lower case.

    Returns
    -------
    str
        The cleaned text. When ``lower`` is falsy and no replacements are
        given, ``text`` is returned unchanged (and not converted to ``str``).
    '''
    if lower:
        # Coerce first: calling .lower() directly raised AttributeError for
        # non-string input, although the replacement loop already allowed it.
        text = str(text).lower()

    if replacements:
        text = str(text)
        for old, new in replacements.items():
            text = text.replace(old, new)

    return text

## 2.2 Data loading and preparation

In [None]:
# importing the excel with all sheets - takes 30 sec
# sheet_name=None makes read_excel load every sheet and return a dict of
# {sheet name: DataFrame} instead of a single DataFrame.
dfs_import = pd.read_excel('ndhvndata_clean_200923_MMnamedtabs.xlsx', sheet_name=None)

In [None]:
# Normalise the sheet names into lower-case, underscore-separated keys for easier coding.
# Order matters: replace() applies the entries one after another, so '__' -> '_'
# comes last to collapse double underscores created by the earlier substitutions.
replacements = {'ø': 'o', '+': '', '-': '_', ' ': '_', '__': '_'}

new_keys = [replace(key, replacements) for key in dfs_import]

# dict(zip(...)) keeps only the last DataFrame for a repeated key, so fail loudly
# if two sheet names normalise to the same key instead of silently losing a sheet.
if len(set(new_keys)) != len(new_keys):
    raise ValueError(f'Sheet names collide after renaming: {new_keys}')

dfs = dict(zip(new_keys, dfs_import.values()))

In [None]:
# Show the normalised sheet names; these are the keys of `dfs`.
new_keys

['respondents',
 'home',
 'sheet_names_code',
 'important_places_obro',
 'disliked_places_obro',
 'important_places_ndhvn',
 'wild_nature',
 'lawns_flowers',
 'social_areas',
 'sports_facilities',
 'facilities_other',
 'other',
 'no_wild_nature',
 'no_lawns_flowers',
 'no_social_areas',
 'no_sports_facilities',
 'no_facilities_other',
 'no_other']

In [None]:
# Columns of the 'respondents' sheet that make up the participant table
participant_columns = [
    'Respondent ID', 'Language', 'Gender', 'Age',
    'NN_awareness', 'NH_frequency',
    # what respondents do in the area
    'NH_activities_living', 'NH_activities_work', 'NH_activities_errands',
    'NH_activities_shops', 'NH_activities_sports', 'NH_activities_rec',
    'NH_activities_social', 'NH_activities_nature', 'NH_activities_other',
    'NH_activities_other_specified',
    # free-text answers (translated further down)
    'NN_HaC_nature', 'NN_HaC_outdoor_rec', 'NN_HaC_experience',
    'NN_HaC_housing', 'NN_HaC_access',
    # memberships
    'Sports_member', 'Sport_member_specified',
    'Enviro_member', 'Enviro_member_specified',
    # languages
    'Language_Danish', 'Language_English', 'Language_German',
    'Language_Swedish', 'Language_Arabic', 'Language_Turkish',
    'Language_Other', 'Language_Other_Specified',
    # background
    'Children_under18', 'Higher_edu', 'Current_occupation',
    'Current_occupation_other_specified', 'Annual_income',
]

# Keep only those columns, then report the row count and preview the table
participants_df = dfs['respondents'][participant_columns]
print(len(participants_df))
participants_df.head()

4607


Unnamed: 0,Respondent ID,Language,Gender,Age,NN_awareness,NH_frequency,NH_activities_living,NH_activities_work,NH_activities_errands,NH_activities_shops,...,Language_Swedish,Language_Arabic,Language_Turkish,Language_Other,Language_Other_Specified,Children_under18,Higher_edu,Current_occupation,Current_occupation_other_specified,Annual_income
0,8sy4beg8kkh7,en,2.0,37.0,,,1,1,1,1,...,1,1,1,1,,,,,,
1,2sm4ths4cub7,en,1.0,37.0,,,1,1,1,1,...,1,1,1,1,,,,,,
2,9ia8ogl4nxc8,en,1.0,37.0,,,1,1,1,1,...,1,1,1,1,,,,,,
3,4hs22czs7c29,en,,,,,1,1,1,1,...,1,1,1,1,,,,,,
4,4cs6tid4x4ja,en,,,,,1,1,1,1,...,1,1,1,1,,,,,,


In [None]:
# Home locations: one WKT point geometry per respondent
home_raw = dfs['home']

# Discard the first row of the sheet and keep only the id and the geometry.
# NOTE(review): the reason the first row is dropped is not visible here — confirm against the sheet.
home_df = home_raw.drop(index=home_raw.index[0])[['Respondent ID', 'wkt']]

print(len(home_df))
home_df.head()

3846


Unnamed: 0,Respondent ID,wkt
1,8sy4beg8kkh7,POINT (12.575378 55.706844)
2,2sm4ths4cub7,POINT (12.575323 55.706751)
3,9ia8ogl4nxc8,POINT (12.575343 55.706824)
4,7zfc37wx7lo4,POINT (12.589762 55.700589)
5,37xba4xw9db3,POINT (12.581288 55.69172)


In [None]:
# Attach the home geometry to every participant (a left join keeps respondents
# without a home point) and keep the first row per respondent.
participants_df = (
    participants_df
    .merge(home_df, how='left', on='Respondent ID')
    .drop_duplicates(subset='Respondent ID')
)
len(participants_df)

4607

In [None]:
# Normalise the column names: lower-case them and swap spaces for underscores
participants_df = participants_df.rename(
    columns=lambda name: replace(name, {' ': '_'})
)
participants_df.columns

Index(['respondent_id', 'language', 'gender', 'age', 'nn_awareness',
       'nh_frequency', 'nh_activities_living', 'nh_activities_work',
       'nh_activities_errands', 'nh_activities_shops', 'nh_activities_sports',
       'nh_activities_rec', 'nh_activities_social', 'nh_activities_nature',
       'nh_activities_other', 'nh_activities_other_specified', 'nn_hac_nature',
       'nn_hac_outdoor_rec', 'nn_hac_experience', 'nn_hac_housing',
       'nn_hac_access', 'sports_member', 'sport_member_specified',
       'enviro_member', 'enviro_member_specified', 'language_danish',
       'language_english', 'language_german', 'language_swedish',
       'language_arabic', 'language_turkish', 'language_other',
       'language_other_specified', 'children_under18', 'higher_edu',
       'current_occupation', 'current_occupation_other_specified',
       'annual_income', 'wkt'],
      dtype='object')

In [None]:
# Make sure 'age' has a numeric dtype; with its default settings pd.to_numeric
# raises if a value cannot be parsed as a number.
participants_df['age'] = pd.to_numeric(participants_df['age'])

In [None]:
# Numeric gender codes -> readable labels
gender_mapping = {
    1: 'Woman',
    2: 'Man',
    3: 'Other',
    4: 'Non-disclosed',
}

# Series.replace leaves values that are not in the mapping (such as NaN) as they are
gender_labels = participants_df['gender'].replace(gender_mapping)
participants_df['gender'] = gender_labels

In [None]:
# Income brackets in code order: code 1 is the first entry, code 5 the last
income_brackets = [
    'under 200.000kr',
    '200.000 - 500.000kr',
    '500.000 - 750.000kr',
    '750.000 - 1.000.000kr',
    'over 1.000.000 kr',
]
income_mapping = dict(enumerate(income_brackets, start=1))

# Swap the numeric codes for their bracket labels
income_labels = participants_df['annual_income'].replace(income_mapping)
participants_df['annual_income'] = income_labels

In [None]:
# Occupation categories in code order: code 1 is the first entry, code 5 the last
employment_categories = ['Employed', 'Self-Employed', 'Student', 'Transfer income', 'Other']
employment_mapping = dict(enumerate(employment_categories, start=1))

# Swap the numeric codes for their category labels
occupation_labels = participants_df['current_occupation'].replace(employment_mapping)
participants_df['current_occupation'] = occupation_labels

In [None]:
# Recode the 0/1 answers of the yes/no questions as text.
# NOTE(review): 1 maps to 'No' and 0 to 'Yes' — presumably this follows the coding
# of the survey export; confirm against the codebook.
binary_mapping = {1: 'No', 0: 'Yes'}
binary_columns = (
    'higher_edu',
    'children_under18',
    'sports_member',
    'enviro_member',
    'nn_awareness',
)

for col in binary_columns:
    recoded = participants_df[col].replace(binary_mapping)
    participants_df[col] = recoded

## 2.3 Translation and export

In [None]:
# Check which language codes occur before translating; besides 'da' and 'en'
# there are combined values ('da, en' and 'en, da').
participants_df['language'].value_counts()

da        3772
en         611
da, en     192
en, da      32
Name: language, dtype: int64

In [None]:
def change_language(text, input_lang='en', target_lang='da'):
    '''Translates text with the module-level DeepL ``translator``.

    Parameters
    ----------
    text : str
        Text to translate.
    input_lang : str or None, default 'en'
        Source language code, forwarded to DeepL as ``source_lang``.
        Pass ``None`` to let DeepL detect the source language itself.
    target_lang : str, default 'da'
        Target language code.

    Returns
    -------
    The result object returned by ``translator.translate_text``; converting it
    with ``str()`` (or reading its ``.text`` attribute) gives the translated string.
    '''
    # input_lang used to be accepted but ignored, so the source language was
    # always auto-detected regardless of what the caller asked for.
    translated_text = translator.translate_text(
        text,
        source_lang=input_lang,
        target_lang=target_lang,
    )
    return translated_text

In [None]:
# LAST RUNTIME: 17/03/2024 13:47
# Translate the free-text answers and save the result, so the (paid) translation
# does not have to be repeated - takes ca. 6 min, ca. 59 000 characters
df = participants_df.copy()

# Respondents whose language code starts with English; the mask is identical for
# every column, so it is computed once instead of once per loop iteration.
condition = df['language'].isin(['en', 'en, da'])


def _translate_answer(x):
    '''Translates one answer; missing or blank cells are returned untouched.'''
    # Without the isna check NaN passed the `x != ''` test, became the string
    # 'nan' and was sent to DeepL - spending quota and overwriting missing values.
    if pd.isna(x) or not str(x).strip():
        return x
    # str() stores the plain translated text rather than the DeepL result object.
    return str(change_language(str(x)))


for col in ['nn_hac_nature', 'nn_hac_outdoor_rec', 'nn_hac_experience',
            'nn_hac_housing', 'nn_hac_access']:
    df.loc[condition, col] = df.loc[condition, col].apply(_translate_answer)

df.to_csv('participants_translated.csv', index=False)

# Next up

**Output data:** `participants_translated.csv`

**Next notebooks:**
 - 3-merge-and-word-embeddings-10-05-2024
 - 4-merged-content-analysis-17-05-2024