In [None]:
import pandas as pd

In [None]:
# Load the raw language records; the path is relative to the working directory.
df = pd.read_csv("OriginalData/languages.csv")
# Display the loaded frame as the cell output.
df

Fix up proficiencies

In [None]:
# Distinct proficiency labels, in order of first appearance in the data.
profs = df["proficiency"].unique()
profs

Replace each proficiency label with a numerical value as follows:
- nan: `default_nan` (currently set to 2)
- elementary:1
- limited working:2
- professional working:3
- full_professional:4
- native_or_bilingual:5

In [None]:
default_nan = 2

# Numeric codes, listed in the same order as the labels appear in `profs`
# (i.e. the order returned by unique() on the proficiency column). This
# mapping is positional, so re-check it whenever the input file changes.
proficiency_codes = [4, 5, 1, default_nan, 3, 2]
if len(profs) != len(proficiency_codes):
    raise ValueError(
        f"Expected {len(proficiency_codes)} distinct proficiency labels, "
        f"found {len(profs)}: {list(profs)}"
    )

# Assign the result back to the column instead of calling
# replace(..., inplace=True) on `df.proficiency`: the in-place form mutates a
# temporary Series and does not update `df` under pandas Copy-on-Write.
# Every distinct label is replaced by an integer, so the column is cast to
# int64 explicitly rather than relying on dtype inference.
df["proficiency"] = (
    df["proficiency"].replace(list(profs), proficiency_codes).astype("int64")
)

Clean up languages

In [None]:
# Alphabetically sorted list of the distinct language strings.
langs = sorted(df["language"].unique().tolist())
langs

Group similar languages together, mark all but the most popular languages as "Other"

In [None]:
from fuzzywuzzy import fuzz
from collections import defaultdict

def group_strings(strings, reference_strings, similarity_threshold):
    """Bucket `strings` under each reference string they fuzzily match.

    Every string in `strings` is compared against every entry of
    `reference_strings` with `fuzz.token_set_ratio`; a string is added to a
    reference's bucket when the score is at least `similarity_threshold`.
    A string may therefore end up in several buckets.

    Args:
        strings: Iterable of candidate strings to classify.
        reference_strings: Iterable of reference strings used as bucket keys.
        similarity_threshold: Minimum score (inclusive) for a match.

    Returns:
        A `defaultdict(list)` mapping each reference string that had at least
        one match to the list of matching strings, in input order.
    """
    groups = defaultdict(list)
    for reference in reference_strings:
        matches = [
            candidate
            for candidate in strings
            if fuzz.token_set_ratio(reference, candidate) >= similarity_threshold
        ]
        # Only touch the dict when something matched, so references without
        # matches do not appear as keys.
        if matches:
            groups[reference].extend(matches)
    return groups


In [None]:
# Reference names used as fuzzy-matching keys: the six languages of interest,
# first by their English names, then by their Turkish names (same order).
lang_groups = [
    "English",
    "German",
    "Turkish",
    "French",
    "Spanish",
    "Chinese",
    "İngilizce",
    "Almanca",
    "Türkçe",
    "Fransizca",
    "İspanyolca",
    "Çince",
]

In [None]:
# Match every raw language string against the reference names; a score of at
# least 75 counts as a match.
groups = group_strings(
    strings=langs,
    reference_strings=lang_groups,
    similarity_threshold=75,
)

In [None]:
# Print each reference name together with the raw strings matched to it.
for key, members in groups.items():
    print("Key: ", key)
    print("Values: ", members)
    print("-------------------")

In [None]:
# Every raw language string that did not land in any fuzzy-match group.
grouped_langs = {member for members in groups.values() for member in members}
other = [lang for lang in langs if lang not in grouped_langs]

In [None]:
# For each language, concatenate the matches found under its English name with
# those found under its Turkish name. `groups` is indexed directly, in the
# same key order as a statement-per-language version would use.
english, german, turkish, french, spanish, chinese = (
    groups[english_name] + groups[turkish_name]
    for english_name, turkish_name in [
        ("English", "İngilizce"),
        ("German", "Almanca"),
        ("Turkish", "Türkçe"),
        ("French", "Fransizca"),
        ("Spanish", "İspanyolca"),
        ("Chinese", "Çince"),
    ]
)

Now replace the languages in the dataframe

In [None]:
# Collapse every spelling variant onto its canonical name, one language at a
# time and in this fixed order; strings that matched no group become "Other".
canonical_variants = [
    ("English", english),
    ("German", german),
    ("Turkish", turkish),
    ("French", french),
    ("Spanish", spanish),
    ("Chinese", chinese),
    ("Other", other),
]
for canonical_name, variants in canonical_variants:
    df.loc[df.language.isin(variants), "language"] = canonical_name

In [None]:
# Canonical language names that get their own proficiency column.
lang_important = [
    "English",
    "German",
    "Turkish",
    "French",
    "Spanish",
    "Chinese",
]

## Further cleaning

Turn each of these languages into its own column whose value is the user's proficiency in that language (0 if the user does not list it), and add a new column with the number of languages each user knows.

In [None]:
# Preview the first rows of the frame after the language replacement.
df.head()

In [None]:
# Number of language rows per user, as a frame with columns
# ["user_id", "N_languages"], ordered by user_id with a fresh 0..n-1 index.
# groupby().size() is used instead of value_counts().reset_index().rename(...)
# because the column names produced by value_counts().reset_index() differ
# between pandas versions, which makes a rename keyed on "index"/"user_id"
# put the user ids into "N_languages" on newer versions.
n_langs = df.groupby("user_id").size().reset_index(name="N_languages")

In [None]:
# Count missing values in each column of n_langs.
n_langs.isna().sum()

In [None]:
# One column per important language: 0 by default, and the row's proficiency
# on the rows whose language is that one.
for lang in lang_important:
    df[lang] = 0
    is_lang = df["language"].eq(lang)
    df.loc[is_lang, lang] = df.loc[is_lang, "proficiency"]

In [None]:
# The per-language columns now carry this information, so drop the originals.
df = df.drop(columns=["language", "proficiency"])

In [None]:
# Collapse to one row per user. Each source row holds a non-zero value in at
# most one language column, so the per-user maximum is that user's proficiency
# in the language (0 if never listed). max() is used rather than sum() so that
# a user with several rows collapsed into the same language keeps the highest
# proficiency instead of an out-of-scale total.
df = df.groupby('user_id').max().reset_index()

In [None]:
# Attach the language count by matching on user_id. A plain column assignment
# aligns on the positional index, which only gives the right answer if both
# frames happen to list users in the same order.
df["NLanguages"] = df["user_id"].map(n_langs.set_index("user_id")["N_languages"])

In [None]:
# Display the resulting frame as the cell output.
df

Save

In [None]:
# Write the prepared frame to disk; index=False leaves the row index out of the file.
df.to_csv("PreparedData/languages.csv", index=False)