In [245]:
import pandas as pd

In [246]:
df = pd.read_csv("OriginalData/languages.csv")
df

Unnamed: 0,user_id,language,proficiency
0,8,İngilizce,full_professional
1,8,Türkçe,native_or_bilingual
2,8,Fransızca,elementary
3,10,ingilizce,
4,11,Turkish,native_or_bilingual
...,...,...,...
76057,66271,English,
76058,66272,English,professional_working
76059,66273,Türkçe,native_or_bilingual
76060,66273,İngilizce,professional_working


Fix up proficiencies

In [247]:
profs = df.proficiency.unique()
profs

array(['full_professional', 'native_or_bilingual', 'elementary', nan,
       'professional_working', 'limited_working'], dtype=object)

Replace with numerical values as such:
- nan:default_nan, currently set as 2
- elementary:1
- limited working:2
- professional working:3
- full_professional:4
- native_or_bilingual:5

In [248]:
default_nan = 2
df.proficiency.replace(profs,[4,5,1,default_nan,3,2],inplace=True)

Clear up Languages

In [249]:
langs = df.language.unique().tolist()
langs.sort()
langs

[' English',
 ' German',
 '-English',
 '1- French',
 '2- English',
 '3- Turkish',
 'ALMANCA',
 'Advanced English',
 'Adıge dili',
 'Afan Oromo',
 'Al Bakiyye & Hûrayca',
 'Albanian',
 'Almanca',
 'Almanca ',
 'Almanca (Düşük Seviye)',
 'Almanca (IAnfänger A2) (Elementary)',
 'Almanca (basic)',
 'Almanca (başlangıç)',
 'Almanca / Deutsch',
 'Almanca B2',
 'Almanca(A1)',
 'Almanca(Beginner)',
 'Almanca, Eski Yüksek (yaklaşık 750-1050)',
 'Almanca, Orta Yüksek (yaklaşık 1050-1500)',
 'Almanca/German',
 'Amazigh (Language of Berber) ',
 'Amharic',
 'Anglais',
 'Antik Yunanca ',
 'Antik Yunanca (1453’e kadar)',
 'Arabe',
 'Arabic',
 'Arabic (only very basic speaking skills)',
 'Arabish',
 'Arapca',
 'Arapca ',
 'Arapça',
 'Arapça ',
 'Arapça(Temel Düzeyde)',
 'Arapça-A1-A2',
 'Armenian',
 'Arnavutça',
 'Assembly',
 'Avaric Маг|арул мац|',
 'Azerbaijani',
 'Azerbaycan Türkçesi',
 'Azerbaycanca',
 'Azeri',
 'Azerice',
 'Azərbaycan',
 'Azərbaycan Dili',
 'BCE',
 'Bahasa Indonesia',
 'Bahasa Me

Group similar languages together, mark all but the most popular languages as "Other"

In [250]:
from fuzzywuzzy import fuzz
from collections import defaultdict

def group_strings(strings, reference_strings, similarity_threshold):
    groups = defaultdict(list)
    for i, string1 in enumerate(reference_strings):
        for j, string2 in enumerate(strings):
            similarity = fuzz.token_set_ratio(string1, string2)
            if similarity >= similarity_threshold:
                groups[string1].append(string2)
    return groups


In [251]:
lang_groups = ["English","German", "Turkish", "French", "Spanish", "Chinese",
"İngilizce", "Almanca", "Türkçe", "Fransizca", "İspanyolca", "Çince"]

In [252]:
groups = group_strings(langs,lang_groups, 75)

In [253]:
for key in groups.keys():
    print("Key: ", key)
    print("Values: ", groups[key])
    print("-------------------")

Key:  English
Values:  [' English', '-English', '2- English', 'Advanced English', 'ENGLISH', 'Engilish,', 'Engish', 'Englis', 'Englisch', 'English', 'English ', 'English (Advanced)', 'English (B2)', 'English (B2, Upper-Intermediate)', 'English (US)', 'English (Upper-Intermediate)', 'English - (YDS : 93,75)', 'English - Global Village Sydney Australia', 'English - Professional working proficiency', 'English C1', 'English UK', 'English US', 'English upper intermediate', 'English(advanced)', 'English,', 'English, Advanced', 'English, Deutsch', 'English, Middle (1100-1500)', 'English, Old (ca.450-1100)', 'English, Pre-Advance', 'English-B2 Upper Intermediate', 'English. French ', 'Turkish (Native Speaker), English (Intermediate)', 'Turkish, English', 'english', 'english ', 'ingilizce(english)', 'İngilizce (English)', 'İngilizce / English', 'İngilizce- BELS english school. as upper intermediate', 'İngilizce/English', '■ English ■']
-------------------
Key:  German
Values:  [' German', 'Alma

In [254]:
other=[]
for lang in langs:
    found=False
    for group in groups.values():
        if lang in group:
            found=True
            break
    if not found:
        other.append(lang)

In [255]:
english = groups["English"] + groups["İngilizce"]
german = groups["German"] + groups["Almanca"]
turkish = groups["Turkish"] + groups["Türkçe"]
french = groups["French"] + groups["Fransizca"]
spanish = groups["Spanish"] + groups["İspanyolca"]
chinese = groups["Chinese"] + groups["Çince"]

Now replace the languages in the dataframe

In [256]:
df.loc[df.language.isin(english), "language"] = "English"
df.loc[df.language.isin(german),"language"] = "German"
df.loc[df.language.isin(turkish),"language"] = "Turkish"
df.loc[df.language.isin(french),"language"] = "French"
df.loc[df.language.isin(spanish),"language"] = "Spanish"
df.loc[df.language.isin(chinese),"language"] = "Chinese"
df.loc[df.language.isin(other),"language"] = "Other"

In [257]:
lang_important = ["English", "German", "Turkish", "French", "Spanish", "Chinese"]

## Further clearing

Set these languages as columns, with their values being the proficiency (0 if no proficiency), add new column #languages known,

In [258]:
df.head()

Unnamed: 0,user_id,language,proficiency
0,8,English,4
1,8,Turkish,5
2,8,French,1
3,10,English,2
4,11,Turkish,5


In [259]:
n_langs = df["user_id"].value_counts().reset_index().rename(columns={'index': 'user_id', 'user_id': 'N_languages'})

In [260]:
n_langs.isna().sum()

user_id        0
N_languages    0
dtype: int64

In [261]:
for lang in lang_important:
    df[lang] = 0

for lang in lang_important:
    mask = df['language'] == lang
    df.loc[mask, lang] = df.loc[mask, 'proficiency']

In [262]:
df.drop(columns=["language", "proficiency"], inplace=True)

In [263]:
df = df.groupby('user_id').sum().reset_index()

In [264]:
df["NLanguages"] = n_langs["N_languages"]

In [265]:
df

Unnamed: 0,user_id,English,German,Turkish,French,Spanish,Chinese,NLanguages
0,8,4,0,5,1,0,0,13
1,10,2,0,0,0,0,0,12
2,11,3,0,5,0,0,0,12
3,12,3,0,5,0,0,0,12
4,13,4,2,0,0,0,0,12
...,...,...,...,...,...,...,...,...
37285,66265,2,0,0,0,0,0,1
37286,66269,2,0,0,0,0,0,1
37287,66271,2,0,0,0,0,0,1
37288,66272,3,0,0,0,0,0,1


Save

In [266]:
df.to_csv("PreparedData/Languages.csv", index=False)