# This is the code I will be building for my thesis project that I am writing at the chair of Public Economics of Prof. Daniel Schunk
# with Dr. Katharina Hartinger as my adviser.

I will be documenting everything using these markdown cells to describe ideas, processes and problems that arise while writing the code. 

All the packages that will be imported to run the code for my thesis will be readable in the cell below.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import statsmodels.api as sm 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
import enchant


In [12]:
df = pd.read_csv("Game_data_25_04_24.csv")  

# Create the 'Success' column
df['Success'] = df.apply(lambda row: 1 if row['Guess'] == row['Mystery Word'] else 0, axis=1)

df.head()

Unnamed: 0,Table Number,Move,Clues,Mystery Word,Guess,Mode,Speed,Language,End,Success
0,312175859,1609,"Koperta, Naklejony, Pocztowy, Pocztówka",Stamp,Znaczek,,,,,0
1,312175859,1615,"W, Rimmikub, Glazura, Kwadracik",Tile,Płytka,,,,,0
2,312175859,1626,"Grajek, Baśń, Ta, Tuj",Rat,Bard,,,,,0
3,312175859,1632,"Płacony, Restauracja, Kid, Western",Bill,Tip,,,,,0
4,312175859,1638,"Władzy, Złoty, Tytanowy, Obrączka",Ring,Ring,,,,,1


Problem: viele spielen mit Eigennamen, sodass die Sprache nicht richtig erkannt wird. Wie löst man das? z.B. hier: Sabayon, Fin, PannaCotta, Baklava, Tiramisu, tiramisu- algorithmus erkennt es als Englisch


In [18]:
import pandas as pd
import enchant
from collections import Counter

# Initialize dictionaries using pyenchant
d_en = enchant.Dict("en_US")
d_fr = enchant.Dict("French")
d_de = enchant.Dict("German_de_DE")

# Function to detect the language of a single word using pyenchant
def detect_language_single_word(word):
    word = word.strip().lower()
    en_check = d_en.check(word)
    fr_check = d_fr.check(word)
    de_check = d_de.check(word)
    
    # Determine the language of the word
    if en_check + fr_check + de_check >= 2:
        return 'multiple'
    elif en_check:
        return 'en'
    elif fr_check:
        return 'fr'
    elif de_check:
        return 'de'
    else:
        return 'unknown'

# Function to detect the most common language of clues using a weighted voting system for multiple words
def detect_language_clues(text):
    words = text.lower().split(',')
    lang_counter = Counter()
    
    for word in words:
        lang = detect_language_single_word(word.strip())
        if lang != 'unknown':
            lang_counter[lang] += 1
    
    if len(lang_counter) > 1:
        return 'multiple'
    elif lang_counter:
        most_common_lang, count = lang_counter.most_common(1)[0]
        return most_common_lang
    else:
        return 'unknown'

# Function to map full language names to short codes
def map_language(language):
    if language == 'English':
        return 'en'
    elif language == 'Français':
        return 'fr'
    elif language == 'Deutsch':
        return 'de'
    return language

# Function to compare the clues language to the mystery word/guess language
def compare_languages(row):
    mystery_lang = map_language(row['Mystery_Word_Language'])
    guess_lang = map_language(row['Guess_Language'])
    clues_lang = map_language(row['Clues_Language'])
    
    if clues_lang == 'multiple' or mystery_lang == 'multiple' or guess_lang == 'multiple':
        if (mystery_lang in ['en', 'fr', 'de'] and guess_lang in ['en', 'fr', 'de'] and clues_lang in ['en', 'fr', 'de']):
            return 'yes'
        if 'multiple' in [mystery_lang, guess_lang, clues_lang]:
            matched_languages = row['Matching_Languages'].split(', ')
            if any(lang in matched_languages for lang in ['en', 'fr', 'de']):
                return 'yes'
        return 'no'
    elif clues_lang == mystery_lang == guess_lang:
        return 'yes'
    else:
        return 'no'

# Function to determine which languages matched in case of 'multiple'
def determine_matching_languages(matched_languages):
    return ', '.join(matched_languages) if matched_languages else 'N/A'

# Detect the language of the mystery word, clues, and guesses only for rows with NaN in 'Language'
df['Mystery_Word_Language'] = df.apply(lambda row: detect_language_single_word(row['Mystery Word']) if pd.isna(row['Language']) else map_language(row['Language']), axis=1)
df['Guess_Language'] = df.apply(lambda row: detect_language_single_word(row['Guess']) if pd.isna(row['Language']) else map_language(row['Language']), axis=1)
df['Clues_Language'] = df.apply(lambda row: 'unknown' if pd.isna(row['Clues']) else detect_language_clues(row['Clues']), axis=1)

# Drop rows with 'unknown' languages
df = df[(df['Mystery_Word_Language'] != 'unknown') & (df['Guess_Language'] != 'unknown') & (df['Clues_Language'] != 'unknown')]

# Apply the comparison function to all rows
df['Language_Match'] = df.apply(lambda row: compare_languages(row), axis=1)

# Convert 'Matched_Languages_Set' from set to string
df['Matching_Languages'] = df.apply(lambda row: determine_matching_languages(set([row['Mystery_Word_Language'], row['Guess_Language'], row['Clues_Language']])) if 'multiple' in [row['Mystery_Word_Language'], row['Guess_Language'], row['Clues_Language']] else 'N/A', axis=1)

# Check how many times 'Language_Match' is set to 'no'
num_no_matches = df['Language_Match'].value_counts().get('no', 0)
print(f"Number of rows where 'Language_Match' is set to 'no': {num_no_matches}")

# Save the cleaned dataset with the new 'Language_Match' and 'Matching_Languages' columns
df.to_csv('Game_data_cleaned.csv', index=False)

# Display the first few rows of the updated dataframe
df.head()

Number of rows where 'Language_Match' is set to 'no': 1912


Unnamed: 0,Table Number,Move,Clues,Mystery Word,Guess,Mode,Speed,Language,End,Success,Mystery_Word_Language,Guess_Language,Clues_Language,Matching_Languages,Language_Match
2,312175859,1626,"Grajek, Baśń, Ta, Tuj",Rat,Bard,,,,,0,multiple,multiple,multiple,multiple,no
3,312175859,1632,"Płacony, Restauracja, Kid, Western",Bill,Tip,,,,,0,multiple,en,multiple,"multiple, en",yes
7,312175859,1654,"Mandala, Gra, Szary, Twardy",Stone,Stone,,,,,1,en,en,en,,yes
8,312552889,1654,"Time, Mechanical, Invention, Automated, automated",Machine,Clock,,,,,0,multiple,en,multiple,"multiple, en",yes
9,312552889,1671,"Craft, Stalls, ""Farmers"", ""Farmers"", Farmers",Market,Fair,,,,,0,en,multiple,en,"multiple, en",yes


In [5]:
# Count the number of NaN entries in the 'Language' column
num_nan_language = df['Language'].isna().sum()

print(f"Number of NaN entries in the 'Language' column: {num_nan_language}")

Number of NaN entries in the 'Language' column: 8373


In [14]:
df_matched = pd.read_csv("Game_data_with_language_match.csv") 

Create new dataset using only the clues.


In [7]:
# Create a new DataFrame with only the clues, remove duplicates
clues_df = df[['Clues']].dropna().drop_duplicates().reset_index(drop=True)

# Save the unique clues DataFrame to a new CSV file
clues_df.to_csv('Game_data_unique_clues.csv', index=False)

# Display the first few rows of the unique clues DataFrame
print(clues_df.head())

                                     Clues
0  Koperta, Naklejony, Pocztowy, Pocztówka
1          W, Rimmikub, Glazura, Kwadracik
2                    Grajek, Baśń, Ta, Tuj
3       Płacony, Restauracja, Kid, Western
4        Władzy, Złoty, Tytanowy, Obrączka


Create dataset with only the mystery words in it

In [6]:
# Create a new DataFrame with only the mystery words
mystery_words_df = df[['Mystery Word']].dropna().reset_index(drop=True)

# Save the mystery words DataFrame to a new CSV file
mystery_words_df.to_csv('Game_data_mystery_words.csv', index=False)

# Display the first few rows of the mystery words DataFrame
print(mystery_words_df.head())

  Mystery Word
0        Stamp
1         Tile
2          Rat
3         Bill
4         Ring
