In this notebook, I will put several pieces of code that didn't end up in the final code for my thesis. In some cases, there is extensive documentation for the code as I intended to leave it in the main file, but in other cases, especially if the code was tossed quickly, the documentation is not as extensive.

In [1]:
import pandas as pd
import statsmodels.api as sm

# Assume df is your dataframe
# Create a dummy variable for missing 'Mode' values
df['Mode_Missing'] = df['Mode'].isna().astype(int)

# Convert 'Mode' to dummy variables, keeping NaN as is
df_with_dummies = pd.get_dummies(df[['Mode', 'Speed', 'Language', 'Mode_Missing']], columns=['Mode', 'Language'], drop_first=True)

# Prepare the predictors (X) and the response variable (y)
X = df_with_dummies
y = df['Success']

# Add a constant to the model (intercept)
X = sm.add_constant(X)

# Fit the logistic regression model
model = sm.Logit(y, X).fit()

# Print the summary of the regression
print(model.summary())

NameError: name 'df' is not defined

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from langdetect import detect

# Load the dataset
df = pd.read_csv('/mnt/data/Game_data_25_04_24.csv')

# Create a dummy variable for missing 'Mode' values
df['Mode_Missing'] = df['Mode'].isna().astype(int)

# Convert 'Mode' and 'Language' to dummy variables, including NaN as a separate category
df = pd.get_dummies(df, columns=['Mode', 'Language'], drop_first=True, dummy_na=True)

# Ensure all columns are numeric
df = df.apply(pd.to_numeric, errors='coerce')

# Calculate number of unique clues
df['Num_Clues'] = df['Clues'].apply(lambda x: len(x.split(',')) if pd.notna(x) else 0)

# Detect the language of the guess and the mystery word, and compare
def detect_language(row):
    try:
        guess_language = detect(row['Guess']) if pd.notna(row['Guess']) else 'unknown'
        mystery_word_language = detect(row['Mystery Word']) if pd.notna(row['Mystery Word']) else 'unknown'
        return 1 if guess_language == mystery_word_language else 0
    except:
        return 0

df['Language_Match'] = df.apply(detect_language, axis=1)

# Define the dependent variable and independent variables
X = df[['Num_Clues', 'Speed', 'Language_Match', 'Mode_Missing'] + [col for col in df.columns if 'Mode_' in col or 'Language_' in col]]
y = df['Success']

# Add a constant to the independent variables
X = sm.add_constant(X)

# Check for any remaining non-numeric columns
non_numeric_columns = X.select_dtypes(include=['object']).columns
if len(non_numeric_columns) > 0:
    print(f"Non-numeric columns found: {non_numeric_columns}")
    X[non_numeric_columns] = X[non_numeric_columns].apply(pd.to_numeric, errors='coerce')

# Ensure there are no missing values
X = X.fillna(0)

# Fit the logistic regression model using statsmodels
logit_model = sm.Logit(y, X).fit()

# Print the model summary
print(logit_model.summary())

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create and fit the logistic regression model using scikit-learn
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

# Predict on the test set
y_pred = logreg.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

In [None]:
import pandas as pd
import enchant
from collections import Counter

# Initialize dictionaries using pyenchant
d_en = enchant.Dict("en_US")
d_fr = enchant.Dict("fr_FR")
d_de = enchant.Dict("de_DE")

# Function to detect the language of a single word using pyenchant
def detect_language_single_word(word):
    word = word.strip().lower()
    en_check = d_en.check(word)
    fr_check = d_fr.check(word)
    de_check = d_de.check(word)
    
    # Determine the language of the word
    if en_check + fr_check + de_check > 1:
        return 'multiple'
    elif en_check:
        return 'en'
    elif fr_check:
        return 'fr'
    elif de_check:
        return 'de'
    else:
        return 'unknown'

# Function to detect the most common language of clues using a weighted voting system for multiple words
def detect_language_clues(text):
    words = text.lower().split(',')
    lang_counter = Counter()
    
    for word in words:
        lang = detect_language_single_word(word.strip())
        if lang != 'unknown' and lang != 'multiple':
            lang_counter[lang] += 1
    
    if lang_counter:
        most_common_lang, count = lang_counter.most_common(1)[0]
        return most_common_lang
    else:
        return 'unknown'

# Function to map full language names to short codes
def map_language(language):
    if language == 'English':
        return 'en'
    elif language == 'Français':
        return 'fr'
    elif language == 'Deutsch':
        return 'de'
    return language

# Function to compare the clues language to the mystery word/guess language
def compare_languages(row):
    mystery_lang = map_language(row['Mystery_Word_Language'])
    guess_lang = map_language(row['Guess_Language'])
    clues_lang = map_language(row['Clues_Language'])
    
    if clues_lang == 'multiple' or mystery_lang == 'multiple' or guess_lang == 'multiple':
        if (mystery_lang in ['en', 'fr', 'de'] and guess_lang in ['en', 'fr', 'de'] and clues_lang in ['en', 'fr', 'de']):
            return 'yes'
        if 'multiple' in [mystery_lang, guess_lang, clues_lang]:
            matched_languages = row['Matching_Languages'].split(', ')
            if any(lang in matched_languages for lang in ['en', 'fr', 'de']):
                return 'yes'
        return 'no'
    elif clues_lang == mystery_lang == guess_lang:
        return 'yes'
    else:
        return 'no'

# Function to determine which languages matched in case of 'multiple'
def determine_matching_languages(matched_languages):
    return ', '.join(matched_languages) if matched_languages else 'N/A'

# Load the dataset
df = pd.read_csv('/mnt/data/Game_data_25_04_24.csv')

# Detect the language of the mystery word, clues, and guesses only for rows with NaN in 'Language'
df['Mystery_Word_Language'] = df.apply(lambda row: detect_language_single_word(row['Mystery Word']) if pd.isna(row['Language']) else map_language(row['Language']), axis=1)
df['Guess_Language'] = df.apply(lambda row: detect_language_single_word(row['Guess']) if pd.isna(row['Language']) else map_language(row['Language']), axis=1)
df['Clues_Language'] = df.apply(lambda row: 'unknown' if pd.isna(row['Clues']) else detect_language_clues(row['Clues']), axis=1)

# Drop rows with 'unknown' languages
df = df[(df['Mystery_Word_Language'] != 'unknown') & (df['Guess_Language'] != 'unknown') & (df['Clues_Language'] != 'unknown')]

# Apply the comparison function only to rows with NaN in 'Language'
df['Language_Match'] = df.apply(lambda row: compare_languages(row) if pd.isna(row['Language']) else 'N/A', axis=1)

# Convert 'Matched_Languages_Set' from set to string
df['Matching_Languages'] = df.apply(lambda row: determine_matching_languages(set([row['Mystery_Word_Language'], row['Guess_Language'], row['Clues_Language']])) if 'multiple' in [row['Mystery_Word_Language'], row['Guess_Language'], row['Clues_Language']] else 'N/A', axis=1)

# Save the cleaned dataset with the new 'Language_Match' and 'Matching_Languages' columns
df.to_csv('/mnt/data/Game_data_cleaned.csv', index=False)

# Display the first few rows of the updated dataframe
df.head()

Clues cannot be declared as multiple, code below fixes that

In [None]:
import pandas as pd
import enchant
from collections import Counter

# Initialize dictionaries using pyenchant
d_en = enchant.Dict("en_US")
d_fr = enchant.Dict("fr_FR")
d_de = enchant.Dict("de_DE")

# Function to detect the language of a single word using pyenchant
def detect_language_single_word(word):
    word = word.strip().lower()
    en_check = d_en.check(word)
    fr_check = d_fr.check(word)
    de_check = d_de.check(word)
    
    # Determine the language of the word
    if en_check + fr_check + de_check > 1:
        return 'multiple'
    elif en_check:
        return 'en'
    elif fr_check:
        return 'fr'
    elif de_check:
        return 'de'
    else:
        return 'unknown'

# Function to detect the most common language of clues using a weighted voting system for multiple words
def detect_language_clues(text):
    words = text.lower().split(',')
    lang_counter = Counter()
    
    for word in words:
        lang = detect_language_single_word(word.strip())
        if lang != 'unknown':
            lang_counter[lang] += 1
    
    if len(lang_counter) > 1:
        return 'multiple'
    elif lang_counter:
        most_common_lang, count = lang_counter.most_common(1)[0]
        return most_common_lang
    else:
        return 'unknown'

# Function to map full language names to short codes
def map_language(language):
    if language == 'English':
        return 'en'
    elif language == 'Français':
        return 'fr'
    elif language == 'Deutsch':
        return 'de'
    return language

# Function to compare the clues language to the mystery word/guess language
def compare_languages(row):
    mystery_lang = map_language(row['Mystery_Word_Language'])
    guess_lang = map_language(row['Guess_Language'])
    clues_lang = map_language(row['Clues_Language'])
    
    if clues_lang == 'multiple' or mystery_lang == 'multiple' or guess_lang == 'multiple':
        if (mystery_lang in ['en', 'fr', 'de'] and guess_lang in ['en', 'fr', 'de'] and clues_lang in ['en', 'fr', 'de']):
            return 'yes'
        if 'multiple' in [mystery_lang, guess_lang, clues_lang]:
            matched_languages = row['Matching_Languages'].split(', ')
            if any(lang in matched_languages for lang in ['en', 'fr', 'de']):
                return 'yes'
        return 'no'
    elif clues_lang == mystery_lang == guess_lang:
        return 'yes'
    else:
        return 'no'

# Function to determine which languages matched in case of 'multiple'
def determine_matching_languages(matched_languages):
    return ', '.join(matched_languages) if matched_languages else 'N/A'

# Load the dataset
df = pd.read_csv('/mnt/data/Game_data_25_04_24.csv')

# Detect the language of the mystery word, clues, and guesses only for rows with NaN in 'Language'
df['Mystery_Word_Language'] = df.apply(lambda row: detect_language_single_word(row['Mystery Word']) if pd.isna(row['Language']) else map_language(row['Language']), axis=1)
df['Guess_Language'] = df.apply(lambda row: detect_language_single_word(row['Guess']) if pd.isna(row['Language']) else map_language(row['Language']), axis=1)
df['Clues_Language'] = df.apply(lambda row: 'unknown' if pd.isna(row['Clues']) else detect_language_clues(row['Clues']), axis=1)

# Drop rows with 'unknown' languages
df = df[(df['Mystery_Word_Language'] != 'unknown') & (df['Guess_Language'] != 'unknown') & (df['Clues_Language'] != 'unknown')]

# Apply the comparison function only to rows with NaN in 'Language'
df['Language_Match'] = df.apply(lambda row: compare_languages(row) if pd.isna(row['Language']) else 'N/A', axis=1)

# Convert 'Matched_Languages_Set' from set to string
df['Matching_Languages'] = df.apply(lambda row: determine_matching_languages(set([row['Mystery_Word_Language'], row['Guess_Language'], row['Clues_Language']])) if 'multiple' in [row['Mystery_Word_Language'], row['Guess_Language'], row['Clues_Language']] else 'N/A', axis=1)

# Save the cleaned dataset with the new 'Language_Match' and 'Matching_Languages' columns
df.to_csv('/mnt/data/Game_data_cleaned.csv', index=False)

# Display the first few rows of the updated dataframe
df.head()

### Documentation (as of August 2, 2024)

1. **Initialize Dictionaries**:
   - The script initializes dictionaries for English, French, and German using the `pyenchant` library.

2. **Function: `detect_language_single_word`**:
   - This function checks if a word exists in any of the three dictionaries.
   - It returns `'multiple'` if the word is found in two or more dictionaries.
   - It returns `'en'`, `'fr'`, or `'de'` if the word is found in the English, French, or German dictionary, respectively.
   - It returns `'unknown'` if the word is not found in any dictionary.

3. **Function: `detect_language_clues`**:
   - This function detects the most common language of the clues using a weighted voting system.
   - It splits the clues into individual words and checks each word using `detect_language_single_word`.
   - It returns `'multiple'` if words from more than one language are found among the clues.
   - It returns the most common language if all words are from one language.
   - It returns `'unknown'` if no valid words are found.

4. **Function: `map_language`**:
   - This function maps full language names to their short codes.
   - It converts 'English' to 'en', 'Français' to 'fr', and 'Deutsch' to 'de'.

5. **Function: `compare_languages`**:
   - This function compares the languages of the mystery word, guess, and clues.
   - It returns `'yes'` if the languages match or if they are valid combinations involving `'multiple'`.
   - It returns `'no'` if the languages do not match.

6. **Function: `determine_matching_languages`**:
   - This function determines which languages matched in the case of `'multiple'`.
   - It returns a comma-separated list of the matched languages or 'N/A' if no languages matched.

7. **Language Detection**:
   - The script detects the language of the mystery word, guess, and clues only for rows with NaN in 'Language'.
   - The languages are mapped using `map_language` if the 'Language' column is not NaN.

8. **Filter Rows**:
   - Rows with 'unknown' languages are dropped from the dataset.

9. **Determine Matching Languages**:
   - The `determine_matching_languages` function is applied to all rows to create the 'Matching_Languages' column.

10. **Compare Languages**:
    - The `compare_languages` function is applied to all rows to create the 'Language_Match' column.

11. **Statistics**:
    - The script prints the number of rows where 'Language_Match' is set to 'no'.

12. **Save the Dataset**:
    - The cleaned dataset, including the new 'Language_Match' and 'Matching_Languages' columns, is saved to `Game_data_cleaned.csv`.

13. **Display Valid Rounds**:
    - The first few rows of the valid rounds dataframe are displayed for verification.


In [None]:
import pandas as pd
import enchant
from collections import Counter

# Initialize dictionaries using pyenchant
d_en = enchant.Dict("en_US")
d_fr = enchant.Dict("French")
d_de = enchant.Dict("German_de_DE")

# Function to detect the language of a single word using pyenchant
def detect_language_single_word(word):
    word = word.strip().lower()
    en_check = d_en.check(word)
    fr_check = d_fr.check(word)
    de_check = d_de.check(word)
    
    # Determine the language of the word
    if en_check + fr_check + de_check >= 2:
        return 'multiple'
    elif en_check:
        return 'en'
    elif fr_check:
        return 'fr'
    elif de_check:
        return 'de'
    else:
        return 'unknown'

# Function to detect the most common language of clues using a weighted voting system for multiple words
def detect_language_clues(text):
    words = text.lower().split(',')
    lang_counter = Counter()
    
    for word in words:
        lang = detect_language_single_word(word.strip())
        if lang != 'unknown':
            lang_counter[lang] += 1
    
    if len(lang_counter) > 1:
        return 'multiple'
    elif lang_counter:
        most_common_lang, count = lang_counter.most_common(1)[0]
        return most_common_lang
    else:
        return 'unknown'

# Function to map full language names to short codes
def map_language(language):
    if language == 'English':
        return 'en'
    elif language == 'Français':
        return 'fr'
    elif language == 'Deutsch':
        return 'de'
    return language

# Function to compare the clues language to the mystery word/guess language
def compare_languages(row):
    mystery_lang = map_language(row['Mystery_Word_Language'])
    guess_lang = map_language(row['Guess_Language'])
    clues_lang = map_language(row['Clues_Language'])
    
    if clues_lang == 'multiple' or mystery_lang == 'multiple' or guess_lang == 'multiple':
        if (mystery_lang in ['en', 'fr', 'de'] and guess_lang in ['en', 'fr', 'de'] and clues_lang in ['en', 'fr', 'de']):
            return 'yes'
        if 'multiple' in [mystery_lang, guess_lang, clues_lang]:
            matched_languages = row['Matching_Languages'].split(', ')
            if any(lang in matched_languages for lang in ['en', 'fr', 'de']):
                return 'yes'
        return 'no'
    elif clues_lang == mystery_lang == guess_lang:
        return 'yes'
    else:
        return 'no'

# Function to determine which languages matched in case of 'multiple'
def determine_matching_languages(row):
    matched_languages = set()
    if row['Mystery_Word_Language'] == 'multiple':
        matched_languages.update(['en', 'fr', 'de'])
    if row['Guess_Language'] == 'multiple':
        matched_languages.update(['en', 'fr', 'de'])
    if row['Clues_Language'] == 'multiple':
        matched_languages.update(['en', 'fr', 'de'])
    return ', '.join(matched_languages) if matched_languages else 'N/A'

# Detect the language of the mystery word, clues, and guesses only for rows with NaN in 'Language'
df['Mystery_Word_Language'] = df.apply(lambda row: detect_language_single_word(row['Mystery Word']) if pd.isna(row['Language']) else map_language(row['Language']), axis=1)
df['Guess_Language'] = df.apply(lambda row: detect_language_single_word(row['Guess']) if pd.isna(row['Language']) else map_language(row['Language']), axis=1)
df['Clues_Language'] = df.apply(lambda row: 'unknown' if pd.isna(row['Clues']) else detect_language_clues(row['Clues']), axis=1)

# Drop rows with 'unknown' languages
df = df[(df['Mystery_Word_Language'] != 'unknown') & (df['Guess_Language'] != 'unknown') & (df['Clues_Language'] != 'unknown')]

# Determine matching languages for all rows
df['Matching_Languages'] = df.apply(determine_matching_languages, axis=1)

# Apply the comparison function to all rows
df['Language_Match'] = df.apply(compare_languages, axis=1)

# Check how many times 'Language_Match' is set to 'no'
num_no_matches = df['Language_Match'].value_counts().get('no', 0)
print(f"Number of rows where 'Language_Match' is set to 'no': {num_no_matches}")

# Save the cleaned dataset with the new 'Language_Match' and 'Matching_Languages' columns
df.to_csv('/mnt/data/Game_data_cleaned.csv', index=False)

# Display the first few rows of the updated dataframe
df.head()

Thrown out: 02.08.2024

In [None]:
import pandas as pd
import enchant
from collections import Counter

# Initialize dictionaries using pyenchant
d_en = enchant.Dict("en_US")
d_fr = enchant.Dict("French")
d_de = enchant.Dict("German_de_DE")

# Global counter for invalid words
invalid_word_count = 0

def word_in_dictionaries(word):
    word = word.strip().lower()
    return d_en.check(word) or d_fr.check(word) or d_de.check(word)

def validate_round(row):
    global invalid_word_count
    
    mystery_word_valid = word_in_dictionaries(row['Mystery Word'])
    if not mystery_word_valid:
        invalid_word_count += 1

    guess_valid = word_in_dictionaries(row['Guess'])
    if not guess_valid:
        invalid_word_count += 1

    clues_valid = True
    for word in row['Clues'].split(','):
        if not word_in_dictionaries(word):
            invalid_word_count += 1
            clues_valid = False

    return mystery_word_valid and guess_valid and clues_valid

# Validate each round
df['Valid_Round'] = df.apply(validate_round, axis=1)

# Split the dataset into valid and rejected rounds
valid_rounds = df[df['Valid_Round']]
rejected_rounds = df[~df['Valid_Round']]

# Drop the helper column
valid_rounds = valid_rounds.drop(columns=['Valid_Round'])
rejected_rounds = rejected_rounds.drop(columns=['Valid_Round'])

# Save the datasets
valid_rounds.to_csv('/mnt/data/Game_data_valid.csv', index=False)
rejected_rounds.to_csv('/mnt/data/rejected_rounds.csv', index=False)

# Display the first few rows of the valid rounds dataframe
valid_rounds.head()

# Calculate the number of words rejected
num_rejected_words = (
    rejected_rounds['Mystery Word'].apply(lambda x: len(x.split(','))) +
    rejected_rounds['Guess'].apply(lambda x: len(x.split(','))) +
    rejected_rounds['Clues'].apply(lambda x: len(x.split(',')))
).sum()

# Printout of the number of rounds and words tossed out
num_rejected_rounds = len(rejected_rounds)
print(f"Number of rounds rejected: {num_rejected_rounds}")
print(f"Number of words rejected: {num_rejected_words}")
print(f"Number of invalid words: {invalid_word_count}")



In [None]:
import pandas as pd
import enchant
from collections import Counter

# Initialize dictionaries using pyenchant
d_en = enchant.Dict("en_US")
d_fr = enchant.Dict("French")
d_de = enchant.Dict("German_de_DE")# Global counter for invalid words

invalid_word_count = 0

def word_in_dictionaries(word):
    word = word.strip().lower().replace('"', '').replace('-', '')
    if word == "":
        return False
    return d_en.check(word) or d_fr.check(word) or d_de.check(word)

def validate_round(row):
    global invalid_word_count
    
    mystery_word_valid = word_in_dictionaries(row['Mystery Word'])
    if not mystery_word_valid:
        invalid_word_count += 1

    guess_valid = word_in_dictionaries(row['Guess'])
    if not guess_valid:
        invalid_word_count += 1

    clues_valid = True
    for word in row['Clues'].split(','):
        if not word_in_dictionaries(word):
            invalid_word_count += 1
            clues_valid = False

    return mystery_word_valid and guess_valid and clues_valid

# Validate each round
df['Valid_Round'] = df.apply(validate_round, axis=1)

# Split the dataset into valid and rejected rounds
valid_rounds = df[df['Valid_Round']]
rejected_rounds = df[~df['Valid_Round']]

# Drop the helper column
valid_rounds = valid_rounds.drop(columns=['Valid_Round'])
rejected_rounds = rejected_rounds.drop(columns=['Valid_Round'])

# Save the datasets
valid_rounds.to_csv('/mnt/data/Game_data_valid.csv', index=False)
rejected_rounds.to_csv('/mnt/data/rejected_rounds.csv', index=False)

# Display the first few rows of the valid rounds dataframe
valid_rounds.head()

# Calculate the number of words rejected
num_rejected_words = (
    rejected_rounds['Mystery Word'].apply(lambda x: len(x.split(','))) +
    rejected_rounds['Guess'].apply(lambda x: len(x.split(','))) +
    rejected_rounds['Clues'].apply(lambda x: len(x.split(',')))
).sum()

# Printout of the number of rounds and words tossed out
num_rejected_rounds = len(rejected_rounds)
print(f"Number of rounds rejected: {num_rejected_rounds}")
print(f"Number of words rejected: {num_rejected_words}")
print(f"Number of invalid words: {invalid_word_count}")

23.08.2024

### Documentation

1. **Initialize Dictionaries**:
   - The script initializes dictionaries for English, French, and German using the `pyenchant` library.

2. **Function: `word_in_dictionaries`**:
   - This function checks if a word exists in any of the three dictionaries.
   - It returns `True` if the word is found in at least one dictionary and `False` otherwise.

3. **Function: `validate_round`**:
   - This function validates all words in a round (mystery word, guess, and clues).
   - It splits the clues into individual words and checks each word.
   - The function returns `True` if all words in the round exist in the dictionaries, otherwise `False`.

4. **Load Dataset**:
   - The dataset is loaded from a CSV file.

5. **Validate Each Round**:
   - The script applies the `validate_round` function to each row in the dataframe to determine if the round is valid.

6. **Split the Dataset**:
   - The dataset is split into valid and rejected rounds based on the `Valid_Round` column.

7. **Save the Datasets**:
   - The valid rounds are saved to `Game_data_valid.csv`.
   - The rejected rounds are saved to `rejected_rounds.csv`.

8. **Print Statistics**:
   - The script prints the number of rounds and words rejected.

Another problem that arose is the problem of names. Many names exist in the dictionaries, like Sheldon and Einstein. However, some do not. For example, mystery word is 'Chevalier'- there is a singer duo in France called 'Chevalier du fiel', the clue then being 'du-fiel'- a very real clue, but undetectable by an algorithm. Will most likely drop those rounds as the effort to reward ratio would be very low. Dolittle is another example- a very good clue that the mystery word is 'Doctor', but rejected by the algorithm.

Many players also used different symbols, like " and -. The code will be adjusted in a way that it ignores those symbols- as long as they are right next to a word to avoid the cases in which there is only a symbol without any real clue. A problem to think about here: some players might have used '-' to get around the restriction of only being allowed to type a single word- 'du-fiel' comes to mind again. Both exist as a single word, but together the algorithm tosses them- in such a case rightfully so, unless one defines brand names as single words. 'Chick-Fil-A' poses the same problem. But again: effort to reward ratio just not good enough.

A problem that only revealed itself after carefully pruning the rejected rounds is the missing ability to detect plural. To work around this, the package nltk is used, lemmatizing the words into their singular form for all three languages.

An entirely different point of contention, especially within the German language is the use, or rather the creation, of new words that do not exist in dictionaries. German as a language makes it very easy to simply combine words together to convey meaning- something that works very well with humans but of course poses a big problem to algorithms. Since it is almost impossible to work around this using an algorithm, the question is whether German rounds could be dropped altogether.

In [None]:
import pandas as pd
import enchant
from collections import Counter
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import wordnet as wn
import nltk

# Initialize dictionaries using pyenchant
d_en = enchant.Dict("en_US")
d_fr = enchant.Dict("French")
d_de = enchant.Dict("German_de_DE")

# Ensure necessary NLTK resources are downloaded
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

def get_lemma(word, lang):
    if lang == 'de':
        return lemmatizer.lemmatize(word, pos=wn.NOUN)
    elif lang == 'fr':
        # Use WordNet lemmatizer for French or implement custom logic if necessary
        return lemmatizer.lemmatize(word, pos=wn.NOUN)
    elif lang == 'en':
        return lemmatizer.lemmatize(word, pos=wn.NOUN)
    return word

# Global counter for invalid words
invalid_word_count = 0

def word_in_dictionaries(word):
    word = word.strip().lower().replace('"', '').replace('-', '')
    if word == "":
        return False
    en_check = d_en.check(word)
    fr_check = d_fr.check(word)
    de_check = d_de.check(get_lemma(word, 'de'))  # Lemmatize German words
    if not en_check:
        en_check = d_en.check(get_lemma(word, 'en'))  # Lemmatize English words
    if not fr_check:
        fr_check = d_fr.check(get_lemma(word, 'fr'))  # Lemmatize French words
    return en_check or fr_check or de_check

def validate_round(row):
    global invalid_word_count
    
    mystery_word_valid = word_in_dictionaries(row['Mystery Word'])
    if not mystery_word_valid:
        invalid_word_count += 1

    guess_valid = word_in_dictionaries(row['Guess'])
    if not guess_valid:
        invalid_word_count += 1

    clues_valid = True
    for word in row['Clues'].split(','):
        if not word_in_dictionaries(word):
            invalid_word_count += 1
            clues_valid = False

    return mystery_word_valid and guess_valid and clues_valid

# Validate each round
df['Valid_Round'] = df.apply(validate_round, axis=1)

# Split the dataset into valid and rejected rounds
valid_rounds = df[df['Valid_Round']]
rejected_rounds = df[~df['Valid_Round']]

# Drop the helper column
valid_rounds = valid_rounds.drop(columns=['Valid_Round'])
rejected_rounds = rejected_rounds.drop(columns=['Valid_Round'])

# Save the datasets
valid_rounds.to_csv('/mnt/data/Game_data_valid.csv', index=False)
rejected_rounds.to_csv('/mnt/data/rejected_rounds.csv', index=False)

# Display the first few rows of the valid rounds dataframe
valid_rounds.head()

# Calculate the number of words rejected
num_rejected_words = (
    rejected_rounds['Mystery Word'].apply(lambda x: len(x.split(','))) +
    rejected_rounds['Guess'].apply(lambda x: len(x.split(','))) +
    rejected_rounds['Clues'].apply(lambda x: len(x.split(',')))
).sum()

# Printout of the number of rounds and words tossed out
num_rejected_rounds = len(rejected_rounds)
print(f"Number of rounds rejected: {num_rejected_rounds}")
print(f"Number of words rejected: {num_rejected_words}")
print(f"Number of invalid words: {invalid_word_count}")


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\domin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\domin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Number of rounds rejected: 9593
Number of words rejected: 70965
Number of invalid words: 21316


In [None]:
def detect_language_single_word(word):
    word = word.strip().lower().replace('"', '').replace('-', '')
    en_check = d_en.check(word)
    fr_check = d_fr.check(word)
    de_check = d_de.check(word)
    
    # Determine the language of the word
    if en_check:
        return 'en'
    elif fr_check:
        return 'fr'
    elif de_check:
        return 'de'
    else:
        return 'unknown'

def detect_language_clues(text):
    words = text.lower().split(',')
    lang_counter = Counter()
    
    for word in words:
        lang = detect_language_single_word(word.strip())
        if lang != 'unknown':
            lang_counter[lang] += 1
    
    if lang_counter:
        most_common_lang, count = lang_counter.most_common(1)[0]
        return most_common_lang
    else:
        return 'unknown'

def map_language(language):
    if language == 'English':
        return 'en'
    elif language == 'Français':
        return 'fr'
    elif language == 'Deutsch':
        return 'de'
    return language

# Load the valid rounds dataset
valid_rounds = pd.read_csv('/mnt/data/Game_data_valid.csv')

# Detect the language only for rows with NaN in 'Language'
valid_rounds['detected'] = valid_rounds.apply(
    lambda row: detect_language_clues(row['Clues']) if pd.isna(row['Language']) else map_language(row['Language']), 
    axis=1
)

# Save the updated dataset
valid_rounds.to_csv('/mnt/data/Game_data_valid_language_match.csv', index=False)

# Display the first few rows of the updated dataframe
valid_rounds.head()

Unnamed: 0,Table Number,Move,Clues,Mystery Word,Guess,Mode,Speed,Language,End,Success,detected
0,312552889,1654,"Time, Mechanical, Invention, Automated, automated",Machine,Clock,,,,,0,en
1,312552889,1671,"Craft, Stalls, ""Farmers"", ""Farmers"", Farmers",Market,Fair,,,,,0,en
2,312552889,1689,"Blower, Frond, Paper, Green, pile",Leaf,leaf,,,,,0,en
3,312552889,1697,"magnetism, Romance, Smitten, Magnetic, Magnetic",Attraction,Attraction,,,,,1,en
4,312552889,1731,"Representatives, Dog, bungalow, Residence, Boat",House,House,,,,,1,en


In [None]:
# Count the number of NaN entries in the 'Language' column
num_nan_language = df['Language'].isna().sum()

print(f"Number of NaN entries in the 'Language' column: {num_nan_language}")

In [None]:
df_matched = pd.read_csv("Game_data_with_language_match.csv") 

Create new dataset using only the clues.


In [None]:
# Create a new DataFrame with only the clues, remove duplicates
clues_df = df[['Clues']].dropna().drop_duplicates().reset_index(drop=True)

# Save the unique clues DataFrame to a new CSV file
clues_df.to_csv('Game_data_unique_clues.csv', index=False)

# Display the first few rows of the unique clues DataFrame
print(clues_df.head())

Create dataset with only the mystery words in it

In [None]:
# Create a new DataFrame with only the mystery words
mystery_words_df = df[['Mystery Word']].dropna().reset_index(drop=True)

# Save the mystery words DataFrame to a new CSV file
mystery_words_df.to_csv('Game_data_mystery_words.csv', index=False)

# Display the first few rows of the mystery words DataFrame
print(mystery_words_df.head())

All the packages that will be imported to run the code for my thesis will be readable in the cell below.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import statsmodels.api as sm 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
import enchant


In [None]:
df = pd.read_csv("Game_data_25_04_24.csv")  

# Create the 'Success' column
df['Success'] = df.apply(lambda row: 1 if row['Guess'] == row['Mystery Word'] else 0, axis=1)

df.head()

Unnamed: 0,Table Number,Move,Clues,Mystery Word,Guess,Mode,Speed,Language,End,Success
0,312175859,1609,"Koperta, Naklejony, Pocztowy, Pocztówka",Stamp,Znaczek,,,,,0
1,312175859,1615,"W, Rimmikub, Glazura, Kwadracik",Tile,Płytka,,,,,0
2,312175859,1626,"Grajek, Baśń, Ta, Tuj",Rat,Bard,,,,,0
3,312175859,1632,"Płacony, Restauracja, Kid, Western",Bill,Tip,,,,,0
4,312175859,1638,"Władzy, Złoty, Tytanowy, Obrączka",Ring,Ring,,,,,1


Problem: viele spielen mit Eigennamen, sodass die Sprache nicht richtig erkannt wird. Wie löst man das? z.B. hier: Sabayon, Fin, PannaCotta, Baklava, Tiramisu, tiramisu- algorithmus erkennt es als Englisch


In [None]:
import pandas as pd
import enchant
from collections import Counter

# Initialize dictionaries using pyenchant
d_en = enchant.Dict("en_US")
d_fr = enchant.Dict("French")
d_de = enchant.Dict("German_de_DE")

# Function to detect the language of a single word using pyenchant
def detect_language_single_word(word):
    word = word.strip().lower()
    en_check = d_en.check(word)
    fr_check = d_fr.check(word)
    de_check = d_de.check(word)
    
    # Determine the language of the word
    if en_check + fr_check + de_check >= 2:
        return 'multiple'
    elif en_check:
        return 'en'
    elif fr_check:
        return 'fr'
    elif de_check:
        return 'de'
    else:
        return 'unknown'

# Function to detect the most common language of clues using a weighted voting system for multiple words
def detect_language_clues(text):
    words = text.lower().split(',')
    lang_counter = Counter()
    
    for word in words:
        lang = detect_language_single_word(word.strip())
        if lang != 'unknown':
            lang_counter[lang] += 1
    
    if len(lang_counter) > 1:
        return 'multiple'
    elif lang_counter:
        most_common_lang, count = lang_counter.most_common(1)[0]
        return most_common_lang
    else:
        return 'unknown'

# Function to map full language names to short codes
def map_language(language):
    if language == 'English':
        return 'en'
    elif language == 'Français':
        return 'fr'
    elif language == 'Deutsch':
        return 'de'
    return language

# Function to compare the clues language to the mystery word/guess language
def compare_languages(row):
    mystery_lang = map_language(row['Mystery_Word_Language'])
    guess_lang = map_language(row['Guess_Language'])
    clues_lang = map_language(row['Clues_Language'])
    
    if clues_lang == 'multiple' or mystery_lang == 'multiple' or guess_lang == 'multiple':
        if (mystery_lang in ['en', 'fr', 'de'] and guess_lang in ['en', 'fr', 'de'] and clues_lang in ['en', 'fr', 'de']):
            return 'yes'
        if 'multiple' in [mystery_lang, guess_lang, clues_lang]:
            matched_languages = row['Matching_Languages'].split(', ')
            if any(lang in matched_languages for lang in ['en', 'fr', 'de']):
                return 'yes'
        return 'no'
    elif clues_lang == mystery_lang == guess_lang:
        return 'yes'
    else:
        return 'no'

# Function to determine which languages matched in case of 'multiple'
def determine_matching_languages(matched_languages):
    return ', '.join(matched_languages) if matched_languages else 'N/A'

# Detect the language of the mystery word, clues, and guesses only for rows with NaN in 'Language'
df['Mystery_Word_Language'] = df.apply(lambda row: detect_language_single_word(row['Mystery Word']) if pd.isna(row['Language']) else map_language(row['Language']), axis=1)
df['Guess_Language'] = df.apply(lambda row: detect_language_single_word(row['Guess']) if pd.isna(row['Language']) else map_language(row['Language']), axis=1)
df['Clues_Language'] = df.apply(lambda row: 'unknown' if pd.isna(row['Clues']) else detect_language_clues(row['Clues']), axis=1)

# Drop rows with 'unknown' languages
df = df[(df['Mystery_Word_Language'] != 'unknown') & (df['Guess_Language'] != 'unknown') & (df['Clues_Language'] != 'unknown')]

# Apply the comparison function to all rows
df['Language_Match'] = df.apply(lambda row: compare_languages(row), axis=1)

# Convert 'Matched_Languages_Set' from set to string
df['Matching_Languages'] = df.apply(lambda row: determine_matching_languages(set([row['Mystery_Word_Language'], row['Guess_Language'], row['Clues_Language']])) if 'multiple' in [row['Mystery_Word_Language'], row['Guess_Language'], row['Clues_Language']] else 'N/A', axis=1)

# Check how many times 'Language_Match' is set to 'no'
num_no_matches = df['Language_Match'].value_counts().get('no', 0)
print(f"Number of rows where 'Language_Match' is set to 'no': {num_no_matches}")

# Save the cleaned dataset with the new 'Language_Match' and 'Matching_Languages' columns
df.to_csv('Game_data_cleaned.csv', index=False)

# Display the first few rows of the updated dataframe
df.head()

Number of rows where 'Language_Match' is set to 'no': 1912


Unnamed: 0,Table Number,Move,Clues,Mystery Word,Guess,Mode,Speed,Language,End,Success,Mystery_Word_Language,Guess_Language,Clues_Language,Matching_Languages,Language_Match
2,312175859,1626,"Grajek, Baśń, Ta, Tuj",Rat,Bard,,,,,0,multiple,multiple,multiple,multiple,no
3,312175859,1632,"Płacony, Restauracja, Kid, Western",Bill,Tip,,,,,0,multiple,en,multiple,"multiple, en",yes
7,312175859,1654,"Mandala, Gra, Szary, Twardy",Stone,Stone,,,,,1,en,en,en,,yes
8,312552889,1654,"Time, Mechanical, Invention, Automated, automated",Machine,Clock,,,,,0,multiple,en,multiple,"multiple, en",yes
9,312552889,1671,"Craft, Stalls, ""Farmers"", ""Farmers"", Farmers",Market,Fair,,,,,0,en,multiple,en,"multiple, en",yes


In [None]:
# Count the number of NaN entries in the 'Language' column
num_nan_language = df['Language'].isna().sum()

print(f"Number of NaN entries in the 'Language' column: {num_nan_language}")

Number of NaN entries in the 'Language' column: 8373


In [None]:
df_matched = pd.read_csv("Game_data_with_language_match.csv") 

Create new dataset using only the clues.


In [None]:
# Create a new DataFrame with only the clues, remove duplicates
clues_df = df[['Clues']].dropna().drop_duplicates().reset_index(drop=True)

# Save the unique clues DataFrame to a new CSV file
clues_df.to_csv('Game_data_unique_clues.csv', index=False)

# Display the first few rows of the unique clues DataFrame
print(clues_df.head())

                                     Clues
0  Koperta, Naklejony, Pocztowy, Pocztówka
1          W, Rimmikub, Glazura, Kwadracik
2                    Grajek, Baśń, Ta, Tuj
3       Płacony, Restauracja, Kid, Western
4        Władzy, Złoty, Tytanowy, Obrączka


Create dataset with only the mystery words in it

In [None]:
# Create a new DataFrame with only the mystery words
mystery_words_df = df[['Mystery Word']].dropna().reset_index(drop=True)

# Save the mystery words DataFrame to a new CSV file
mystery_words_df.to_csv('Game_data_mystery_words.csv', index=False)

# Display the first few rows of the mystery words DataFrame
print(mystery_words_df.head())

  Mystery Word
0        Stamp
1         Tile
2          Rat
3         Bill
4         Ring


In [None]:
import pandas as pd

def count_unique_numbers_in_first_column(data):
    # Assuming 'data' is a DataFrame
    first_column = data.iloc[:, 0]  # Select the first column
    unique_numbers = first_column.nunique()  # Count unique numbers
    return unique_numbers

# Example usage:
# Load your dataset into a DataFrame
data = pd.read_csv('Game_data_05_08_24.csv')
unique_count = count_unique_numbers_in_first_column(data)
print(f"The number of unique numbers in the first column is: {unique_count}")

The number of unique numbers in the first column is: 6637


23.08.2024: this piece of code was used to check whether there were still " and . left in the clues column, since they seemed to make some trouble for the algorithm. Since the result came out to be 0, this code was taken out of the main file.

In [None]:
df_cleaned_with_cleaned_clues = pd.read_csv("cleaned_data_with_cleaned_clues.csv")

# Define a function to search for the quotation mark character (") in the cleaned clues after splitting by commas
def count_quotation_marks(dataframe, column_name):
    # Initialize a counter for clues with quotation marks
    quotation_mark_count = 0
    
    # Loop through the specified column
    for clues_string in dataframe[column_name]:
        # Split the clues by commas
        clues_list = str(clues_string).split(',')
        # Check each individual clue for quotation marks
        for clue in clues_list:
            if '"' in clue:
                quotation_mark_count += 1
    
    return quotation_mark_count

def count_periods(dataframe, column_name):
    # Initialize a counter for clues with quotation marks
    period_count = 0
    
    # Loop through the specified column
    for clues_string in dataframe[column_name]:
        # Split the clues by commas
        clues_list = str(clues_string).split(',')
        # Check each individual clue for quotation marks
        for clue in clues_list:
            if '.' in clue:
                period_count+= 1
    
    return period_count

# Count the number of clues with quotation marks in the 'Clues_Cleaned' column after splitting
period_count = count_periods(df_cleaned_with_cleaned_clues, 'Clues')
quotation_mark_count =count_quotation_marks(df_cleaned_with_cleaned_clues, 'Clues')

# Print out the result
print(f"Number of individual clues with quotation marks in the cleaned dataset: {period_count}")
print(f"Number of individual clues with quotation marks in the cleaned dataset: {quotation_mark_count}")