This notebook collects several pieces of code that did not end up in the final code for my thesis. Where I originally intended to keep a piece in the main file, it is documented extensively; where the code was discarded quickly, the documentation is less extensive.

In [1]:
import pandas as pd
import statsmodels.api as sm

# NOTE: `df` must already be loaded in the session; this cell does not create
# it and fails with a NameError when run on a fresh kernel.
# Create a dummy variable for missing 'Mode' values
df['Mode_Missing'] = df['Mode'].isna().astype(int)

# One-hot encode 'Mode' and 'Language'. Without dummy_na, a missing 'Mode' is
# encoded as zeros in every Mode_* column; 'Mode_Missing' marks those rows.
# dtype=float keeps the design matrix purely numeric: boolean dummies mixed
# with numeric columns are converted to an object array, which statsmodels
# refuses to fit.
df_with_dummies = pd.get_dummies(
    df[['Mode', 'Speed', 'Language', 'Mode_Missing']],
    columns=['Mode', 'Language'],
    drop_first=True,
    dtype=float,
)

# Prepare the predictors (X) and the response variable (y)
X = df_with_dummies
y = df['Success']

# Add a constant to the model (intercept)
X = sm.add_constant(X)

# Fit the logistic regression model
model = sm.Logit(y, X).fit()

# Print the summary of the regression
print(model.summary())

NameError: name 'df' is not defined

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from langdetect import detect

# Load the dataset
df = pd.read_csv('/mnt/data/Game_data_25_04_24.csv')

# Create a dummy variable for missing 'Mode' values
df['Mode_Missing'] = df['Mode'].isna().astype(int)

# Convert 'Mode' and 'Language' to dummy variables, including NaN as a separate category
df = pd.get_dummies(df, columns=['Mode', 'Language'], drop_first=True, dummy_na=True)

# Coerce columns to numeric, but leave the free-text columns alone: they are
# still read as text below ('Clues') and by the language comparison
# ('Guess', 'Mystery Word'). Coercing them would turn every value into NaN.
text_columns = ['Clues', 'Guess', 'Mystery Word']
numeric_columns = df.columns.difference(text_columns)
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Count the comma-separated clues per round (0 when 'Clues' is missing).
# Duplicated clues are counted each time they appear.
df['Num_Clues'] = df['Clues'].apply(lambda x: len(x.split(',')) if pd.notna(x) else 0)

# Detect the language of the guess and the mystery word, and compare
def detect_language(row):
    """Return 1 if the guess and the mystery word are detected as the same language, else 0.

    Args:
        row: Mapping-like row providing 'Guess' and 'Mystery Word'.

    Returns:
        1 when both texts are present and `detect` gives the same result for
        both; 0 when either text is missing, the results differ, or detection
        fails.
    """
    try:
        guess = row['Guess']
        mystery_word = row['Mystery Word']
        # Without both texts there is nothing to compare. Previously two
        # missing values were both labelled 'unknown' and counted as a match.
        if pd.isna(guess) or pd.isna(mystery_word):
            return 0
        return 1 if detect(guess) == detect(mystery_word) else 0
    except Exception:
        # Best-effort: any detection failure is reported as "no match".
        # Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit through.
        return 0

df['Language_Match'] = df.apply(detect_language, axis=1)

# Define the dependent variable and independent variables.
# 'Mode_Missing' and 'Language_Match' also contain the substrings used to find
# the dummy columns, so they are excluded from the dummy list to avoid
# selecting them twice. 'Mode_nan' (created by dummy_na=True) is identical to
# 'Mode_Missing' and is left out as well: duplicated columns make the design
# matrix singular.
base_predictors = ['Num_Clues', 'Speed', 'Language_Match', 'Mode_Missing']
dummy_predictors = [
    col for col in df.columns
    if ('Mode_' in col or 'Language_' in col)
    and col not in base_predictors
    and col != 'Mode_nan'
]
X = df[base_predictors + dummy_predictors]
y = df['Success']

# Add a constant to the independent variables
X = sm.add_constant(X)

# Check for any remaining non-numeric columns
non_numeric_columns = X.select_dtypes(include=['object']).columns
if len(non_numeric_columns) > 0:
    print(f"Non-numeric columns found: {non_numeric_columns}")
    X[non_numeric_columns] = X[non_numeric_columns].apply(pd.to_numeric, errors='coerce')

# Ensure there are no missing values, and cast to float so that boolean dummy
# columns do not turn the design matrix into an object array.
X = X.fillna(0).astype(float)

# Fit the logistic regression model using statsmodels
logit_model = sm.Logit(y, X).fit()

# Print the model summary
print(logit_model.summary())

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create and fit the logistic regression model using scikit-learn
# NOTE(review): X still contains the 'const' column added above, while
# LogisticRegression fits its own intercept by default - confirm this is intended.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

# Predict on the test set
y_pred = logreg.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

In [None]:
import pandas as pd
import enchant
from collections import Counter

# One pyenchant dictionary per supported language (English, French, German),
# created in that order.
d_en, d_fr, d_de = (enchant.Dict(tag) for tag in ("en_US", "fr_FR", "de_DE"))

# Function to detect the language of a single word using pyenchant
def detect_language_single_word(word):
    """Classify a single word as 'en', 'fr', 'de', 'multiple' or 'unknown'.

    The word is stripped and lower-cased, then looked up in the module-level
    pyenchant dictionaries d_en, d_fr and d_de.

    Returns:
        'multiple' if two or more dictionaries accept the word, the code of
        the single accepting dictionary if exactly one does, and 'unknown'
        otherwise (including missing, non-string or blank input).
    """
    # Missing cells arrive as NaN (a float), and pyenchant raises ValueError
    # when asked to check an empty string; treat both as undetectable.
    if not isinstance(word, str):
        return 'unknown'
    word = word.strip().lower()
    if not word:
        return 'unknown'

    # NOTE(review): lower-casing may cause entries that the dictionaries only
    # list capitalised (e.g. German nouns) to be rejected - verify.
    en_check = d_en.check(word)
    fr_check = d_fr.check(word)
    de_check = d_de.check(word)

    # Determine the language of the word
    if en_check + fr_check + de_check > 1:
        return 'multiple'
    if en_check:
        return 'en'
    if fr_check:
        return 'fr'
    if de_check:
        return 'de'
    return 'unknown'

# Function to detect the most common language of the clues by simple majority vote
def detect_language_clues(text):
    """Return the language that most comma-separated clues in *text* belong to.

    Each clue is classified with detect_language_single_word and casts one
    vote; clues classified as 'unknown' or 'multiple' do not vote. Ties go to
    the language that was encountered first.

    Returns:
        'en', 'fr' or 'de', or 'unknown' when no clue cast a vote.
    """
    lang_counter = Counter()

    for word in text.lower().split(','):
        word = word.strip()
        if not word:
            # Blank entries (e.g. from a trailing comma) carry no information,
            # and pyenchant refuses to check an empty string.
            continue
        lang = detect_language_single_word(word)
        if lang not in ('unknown', 'multiple'):
            lang_counter[lang] += 1

    if not lang_counter:
        return 'unknown'
    return lang_counter.most_common(1)[0][0]

# Function to map full language names to short codes
def map_language(language):
    """Translate a full language name into its two-letter code.

    'English', 'Français' and 'Deutsch' become 'en', 'fr' and 'de'; any other
    value is handed back untouched.
    """
    for full_name, short_code in (('English', 'en'), ('Français', 'fr'), ('Deutsch', 'de')):
        if language == full_name:
            return short_code
    return language

# Function to compare the clues language to the mystery word/guess language
def compare_languages(row):
    """Return 'yes' or 'no' for whether mystery word, guess and clues share a language.

    Reads 'Mystery_Word_Language', 'Guess_Language' and 'Clues_Language' from
    *row*, each passed through map_language.

    - If any of the three is 'multiple', the answer is 'yes' when
      row['Matching_Languages'] (a ', '-separated string) lists at least one
      of 'en', 'fr', 'de', and 'no' otherwise. That entry must exist in this
      case; a KeyError is raised if it does not.
    - Otherwise the answer is 'yes' only when all three labels are equal.
    """
    mystery_lang = map_language(row['Mystery_Word_Language'])
    guess_lang = map_language(row['Guess_Language'])
    clues_lang = map_language(row['Clues_Language'])

    if 'multiple' in (mystery_lang, guess_lang, clues_lang):
        # The former "all three are concrete codes" check could never succeed
        # here (one label is 'multiple' by definition), so it was removed.
        # NOTE(review): the concrete labels are not compared with each other
        # in this branch - confirm that is intended.
        matched_languages = row['Matching_Languages'].split(', ')
        if any(lang in matched_languages for lang in ('en', 'fr', 'de')):
            return 'yes'
        return 'no'

    return 'yes' if clues_lang == mystery_lang == guess_lang else 'no'

# Function to format the language labels involved in a row containing 'multiple'
def determine_matching_languages(matched_languages):
    """Join language labels into a ', '-separated string.

    Args:
        matched_languages: Collection of label strings (typically a set).

    Returns:
        The labels in sorted order joined by ', ', or 'N/A' when the
        collection is empty. Sorting makes the result reproducible for set
        input, whose iteration order is otherwise arbitrary.
    """
    if not matched_languages:
        return 'N/A'
    return ', '.join(sorted(matched_languages))

# Load the dataset
df = pd.read_csv('/mnt/data/Game_data_25_04_24.csv')

# Mystery word and guess: detect the language only for rows with NaN in
# 'Language'; otherwise use the (mapped) 'Language' value.
df['Mystery_Word_Language'] = df.apply(
    lambda row: detect_language_single_word(row['Mystery Word'])
    if pd.isna(row['Language'])
    else map_language(row['Language']),
    axis=1,
)
df['Guess_Language'] = df.apply(
    lambda row: detect_language_single_word(row['Guess'])
    if pd.isna(row['Language'])
    else map_language(row['Language']),
    axis=1,
)
# Clues: detected for every row; missing clues are marked 'unknown'.
df['Clues_Language'] = df.apply(
    lambda row: 'unknown' if pd.isna(row['Clues']) else detect_language_clues(row['Clues']),
    axis=1,
)

# Drop rows with 'unknown' languages. The copy makes the column assignments
# below operate on an independent frame instead of a slice of the original.
df = df[
    (df['Mystery_Word_Language'] != 'unknown')
    & (df['Guess_Language'] != 'unknown')
    & (df['Clues_Language'] != 'unknown')
].copy()

# Build 'Matching_Languages' BEFORE 'Language_Match': compare_languages reads
# it for every row that contains 'multiple'.
language_columns = ['Mystery_Word_Language', 'Guess_Language', 'Clues_Language']


def _matching_languages(row):
    """Return the row's distinct language labels as a string, or 'N/A' if none is 'multiple'."""
    labels = [row[column] for column in language_columns]
    if 'multiple' not in labels:
        return 'N/A'
    # Sorted so that the saved string does not depend on set iteration order.
    return determine_matching_languages(sorted(set(labels)))


df['Matching_Languages'] = df.apply(_matching_languages, axis=1)

# Apply the comparison function only to rows with NaN in 'Language'
df['Language_Match'] = df.apply(
    lambda row: compare_languages(row) if pd.isna(row['Language']) else 'N/A',
    axis=1,
)

# Save the cleaned dataset with the new 'Language_Match' and 'Matching_Languages' columns
df.to_csv('/mnt/data/Game_data_cleaned.csv', index=False)

# Display the first few rows of the updated dataframe
df.head()

In the cell above, the clues can never be classified as 'multiple'; the code below fixes that.

In [None]:
import pandas as pd
import enchant
from collections import Counter

# Build the English, French and German pyenchant dictionaries (in that order).
d_en, d_fr, d_de = map(enchant.Dict, ("en_US", "fr_FR", "de_DE"))

# Function to detect the language of a single word using pyenchant
def detect_language_single_word(word):
    """Classify a single word as 'en', 'fr', 'de', 'multiple' or 'unknown'.

    The word is stripped and lower-cased, then looked up in the module-level
    pyenchant dictionaries d_en, d_fr and d_de.

    Returns:
        'multiple' if two or more dictionaries accept the word, the code of
        the single accepting dictionary if exactly one does, and 'unknown'
        otherwise (including missing, non-string or blank input).
    """
    # NaN cells are floats, and pyenchant raises ValueError for an empty
    # string; neither can be classified.
    if not isinstance(word, str):
        return 'unknown'
    word = word.strip().lower()
    if not word:
        return 'unknown'

    # NOTE(review): lower-casing may cause entries that the dictionaries only
    # list capitalised (e.g. German nouns) to be rejected - verify.
    en_check = d_en.check(word)
    fr_check = d_fr.check(word)
    de_check = d_de.check(word)

    # Determine the language of the word
    if en_check + fr_check + de_check > 1:
        return 'multiple'
    if en_check:
        return 'en'
    if fr_check:
        return 'fr'
    if de_check:
        return 'de'
    return 'unknown'

# Function to detect the language of the clues (one vote per clue, no weighting)
def detect_language_clues(text):
    """Classify the comma-separated clues in *text* as one language, 'multiple' or 'unknown'.

    Each clue is classified with detect_language_single_word; 'unknown'
    results are ignored, while a 'multiple' result counts as its own label.

    Returns:
        'multiple' if more than one distinct label was seen, the single label
        if exactly one was seen, and 'unknown' if none was.
    """
    lang_counter = Counter()

    for word in text.lower().split(','):
        word = word.strip()
        if not word:
            # Blank entries (e.g. from a trailing comma) carry no information,
            # and pyenchant refuses to check an empty string.
            continue
        lang = detect_language_single_word(word)
        if lang != 'unknown':
            lang_counter[lang] += 1

    # NOTE(review): an ambiguous clue ('multiple') next to e.g. an 'en' clue
    # yields two labels and therefore 'multiple' - confirm this is intended.
    if not lang_counter:
        return 'unknown'
    if len(lang_counter) > 1:
        return 'multiple'
    return lang_counter.most_common(1)[0][0]

# Function to map full language names to short codes
def map_language(language):
    """Return the short code for 'English', 'Français' or 'Deutsch'.

    Values that are not one of these three names are returned unchanged.
    """
    known_names = ('English', 'Français', 'Deutsch')
    short_codes = ('en', 'fr', 'de')
    for name, code in zip(known_names, short_codes):
        if language == name:
            return code
    return language

# Function to compare the clues language to the mystery word/guess language
def compare_languages(row):
    """Return 'yes' or 'no' for whether mystery word, guess and clues share a language.

    Reads 'Mystery_Word_Language', 'Guess_Language' and 'Clues_Language' from
    *row*, each passed through map_language.

    - If any of the three is 'multiple', the answer is 'yes' when
      row['Matching_Languages'] (a ', '-separated string) lists at least one
      of 'en', 'fr', 'de', and 'no' otherwise. That entry must exist in this
      case; a KeyError is raised if it does not.
    - Otherwise the answer is 'yes' only when all three labels are equal.
    """
    mystery_lang = map_language(row['Mystery_Word_Language'])
    guess_lang = map_language(row['Guess_Language'])
    clues_lang = map_language(row['Clues_Language'])

    if 'multiple' in (mystery_lang, guess_lang, clues_lang):
        # The former "all three are concrete codes" check could never succeed
        # here (one label is 'multiple' by definition), so it was removed.
        # NOTE(review): the concrete labels are not compared with each other
        # in this branch - confirm that is intended.
        matched_languages = row['Matching_Languages'].split(', ')
        if any(lang in matched_languages for lang in ('en', 'fr', 'de')):
            return 'yes'
        return 'no'

    return 'yes' if clues_lang == mystery_lang == guess_lang else 'no'

# Function to format the language labels involved in a row containing 'multiple'
def determine_matching_languages(matched_languages):
    """Join language labels into a ', '-separated string.

    Args:
        matched_languages: Collection of label strings (typically a set).

    Returns:
        The labels in sorted order joined by ', ', or 'N/A' when the
        collection is empty. Sorting makes the result reproducible for set
        input, whose iteration order is otherwise arbitrary.
    """
    if not matched_languages:
        return 'N/A'
    return ', '.join(sorted(matched_languages))

# Load the dataset
df = pd.read_csv('/mnt/data/Game_data_25_04_24.csv')

# Mystery word and guess: detect the language only for rows with NaN in
# 'Language'; otherwise use the (mapped) 'Language' value.
df['Mystery_Word_Language'] = df.apply(
    lambda row: detect_language_single_word(row['Mystery Word'])
    if pd.isna(row['Language'])
    else map_language(row['Language']),
    axis=1,
)
df['Guess_Language'] = df.apply(
    lambda row: detect_language_single_word(row['Guess'])
    if pd.isna(row['Language'])
    else map_language(row['Language']),
    axis=1,
)
# Clues: detected for every row; missing clues are marked 'unknown'.
df['Clues_Language'] = df.apply(
    lambda row: 'unknown' if pd.isna(row['Clues']) else detect_language_clues(row['Clues']),
    axis=1,
)

# Drop rows with 'unknown' languages. The copy makes the column assignments
# below operate on an independent frame instead of a slice of the original.
keep_mask = (
    (df['Mystery_Word_Language'] != 'unknown')
    & (df['Guess_Language'] != 'unknown')
    & (df['Clues_Language'] != 'unknown')
)
df = df[keep_mask].copy()

# Build 'Matching_Languages' BEFORE 'Language_Match': compare_languages reads
# it for every row that contains 'multiple'.
language_columns = ['Mystery_Word_Language', 'Guess_Language', 'Clues_Language']


def _matching_languages(row):
    """Return the row's distinct language labels as a string, or 'N/A' if none is 'multiple'."""
    labels = [row[column] for column in language_columns]
    if 'multiple' not in labels:
        return 'N/A'
    # Sorted so that the saved string does not depend on set iteration order.
    return determine_matching_languages(sorted(set(labels)))


df['Matching_Languages'] = df.apply(_matching_languages, axis=1)

# Apply the comparison function only to rows with NaN in 'Language'
df['Language_Match'] = df.apply(
    lambda row: compare_languages(row) if pd.isna(row['Language']) else 'N/A',
    axis=1,
)

# Save the cleaned dataset with the new 'Language_Match' and 'Matching_Languages' columns
df.to_csv('/mnt/data/Game_data_cleaned.csv', index=False)

# Display the first few rows of the updated dataframe
df.head()

### Documentation (as of August 2, 2024)

1. **Initialize Dictionaries**:
   - The script initializes dictionaries for English, French, and German using the `pyenchant` library.

2. **Function: `detect_language_single_word`**:
   - This function checks if a word exists in any of the three dictionaries.
   - It returns `'multiple'` if the word is found in two or more dictionaries.
   - It returns `'en'`, `'fr'`, or `'de'` if the word is found in the English, French, or German dictionary, respectively.
   - It returns `'unknown'` if the word is not found in any dictionary.

3. **Function: `detect_language_clues`**:
   - This function determines the language of the clues by tallying one vote per clue (no weighting is applied).
   - It splits the clues on commas and checks each entry using `detect_language_single_word`.
   - It returns `'multiple'` if words from more than one language are found among the clues.
   - It returns the most common language if all words are from one language.
   - It returns `'unknown'` if no valid words are found.

4. **Function: `map_language`**:
   - This function maps full language names to their short codes.
   - It converts 'English' to 'en', 'Français' to 'fr', and 'Deutsch' to 'de'.

5. **Function: `compare_languages`**:
   - This function compares the languages of the mystery word, guess, and clues.
   - It returns `'yes'` if the languages match or if they are valid combinations involving `'multiple'`.
   - It returns `'no'` if the languages do not match.

6. **Function: `determine_matching_languages`**:
   - This function determines which languages matched in the case of `'multiple'`.
   - It returns a comma-separated list of the matched languages or 'N/A' if no languages matched.

7. **Language Detection**:
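   - The script detects the language of the mystery word and the guess only for rows with NaN in 'Language'; for all other rows, the 'Language' value is mapped using `map_language`.
   - The language of the clues is detected for every row that has clues; rows with missing clues are marked 'unknown'.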

8. **Filter Rows**:
   - Rows with 'unknown' languages are dropped from the dataset.

9. **Determine Matching Languages**:
   - The `determine_matching_languages` function is applied to all rows to create the 'Matching_Languages' column.

10. **Compare Languages**:
    - The `compare_languages` function is applied to all rows to create the 'Language_Match' column.

11. **Statistics**:
    - The script prints the number of rows where 'Language_Match' is set to 'no'.

12. **Save the Dataset**:
    - The cleaned dataset, including the new 'Language_Match' and 'Matching_Languages' columns, is saved to `Game_data_cleaned.csv`.

13. **Display Valid Rounds**:
    - The first few rows of the valid rounds dataframe are displayed for verification.


In [None]:
import pandas as pd
import enchant
from collections import Counter

# One pyenchant dictionary per supported language, created in the order
# English, French, German.
# NOTE(review): "French" and "German_de_DE" look like locally installed
# dictionary names rather than locale codes - confirm they are available.
d_en, d_fr, d_de = (enchant.Dict(tag) for tag in ("en_US", "French", "German_de_DE"))

# Function to detect the language of a single word using pyenchant
def detect_language_single_word(word):
    """Classify a single word as 'en', 'fr', 'de', 'multiple' or 'unknown'.

    The word is stripped and lower-cased, then looked up in the module-level
    pyenchant dictionaries d_en, d_fr and d_de.

    Returns:
        'multiple' if two or more dictionaries accept the word, the code of
        the single accepting dictionary if exactly one does, and 'unknown'
        otherwise (including missing, non-string or blank input).
    """
    # NaN cells are floats, and pyenchant raises ValueError for an empty
    # string; neither can be classified.
    if not isinstance(word, str):
        return 'unknown'
    word = word.strip().lower()
    if not word:
        return 'unknown'

    # NOTE(review): lower-casing may cause entries that the dictionaries only
    # list capitalised (e.g. German nouns) to be rejected - verify.
    en_check = d_en.check(word)
    fr_check = d_fr.check(word)
    de_check = d_de.check(word)

    # Determine the language of the word
    if en_check + fr_check + de_check >= 2:
        return 'multiple'
    if en_check:
        return 'en'
    if fr_check:
        return 'fr'
    if de_check:
        return 'de'
    return 'unknown'

# Function to detect the language of the clues (one vote per clue, no weighting)
def detect_language_clues(text):
    """Classify the comma-separated clues in *text* as one language, 'multiple' or 'unknown'.

    Each clue is classified with detect_language_single_word; 'unknown'
    results are ignored, while a 'multiple' result counts as its own label.

    Returns:
        'multiple' if more than one distinct label was seen, the single label
        if exactly one was seen, and 'unknown' if none was.
    """
    lang_counter = Counter()

    for word in text.lower().split(','):
        word = word.strip()
        if not word:
            # Blank entries (e.g. from a trailing comma) carry no information,
            # and pyenchant refuses to check an empty string.
            continue
        lang = detect_language_single_word(word)
        if lang != 'unknown':
            lang_counter[lang] += 1

    # NOTE(review): an ambiguous clue ('multiple') next to e.g. an 'en' clue
    # yields two labels and therefore 'multiple' - confirm this is intended.
    if not lang_counter:
        return 'unknown'
    if len(lang_counter) > 1:
        return 'multiple'
    return lang_counter.most_common(1)[0][0]

# Function to map full language names to short codes
def map_language(language):
    """Convert 'English' / 'Français' / 'Deutsch' to 'en' / 'fr' / 'de'.

    Anything else passes through unchanged.
    """
    name_to_code = (('English', 'en'), ('Français', 'fr'), ('Deutsch', 'de'))
    matches = [code for name, code in name_to_code if language == name]
    return matches[0] if matches else language

# Function to compare the clues language to the mystery word/guess language
def compare_languages(row):
    """Return 'yes' or 'no' for whether mystery word, guess and clues share a language.

    Reads 'Mystery_Word_Language', 'Guess_Language' and 'Clues_Language' from
    *row*, each passed through map_language.

    - If any of the three is 'multiple', the answer is 'yes' when
      row['Matching_Languages'] (a ', '-separated string) lists at least one
      of 'en', 'fr', 'de', and 'no' otherwise. That entry must exist in this
      case; a KeyError is raised if it does not.
    - Otherwise the answer is 'yes' only when all three labels are equal.
    """
    mystery_lang = map_language(row['Mystery_Word_Language'])
    guess_lang = map_language(row['Guess_Language'])
    clues_lang = map_language(row['Clues_Language'])

    if 'multiple' in (mystery_lang, guess_lang, clues_lang):
        # The former "all three are concrete codes" check could never succeed
        # here (one label is 'multiple' by definition), so it was removed.
        # NOTE(review): the concrete labels are not compared with each other
        # in this branch - confirm that is intended.
        matched_languages = row['Matching_Languages'].split(', ')
        if any(lang in matched_languages for lang in ('en', 'fr', 'de')):
            return 'yes'
        return 'no'

    return 'yes' if clues_lang == mystery_lang == guess_lang else 'no'

# Function to determine which languages matched in case of 'multiple'
def determine_matching_languages(row):
    """Return 'en, fr, de' if any language column of *row* is 'multiple', else 'N/A'.

    Reads 'Mystery_Word_Language', 'Guess_Language' and 'Clues_Language'.
    The three codes are emitted in a fixed order; previously they were joined
    from a set, so their order could differ between runs.
    """
    labels = [
        row['Mystery_Word_Language'],
        row['Guess_Language'],
        row['Clues_Language'],
    ]
    if 'multiple' in labels:
        return ', '.join(('en', 'fr', 'de'))
    return 'N/A'

# NOTE: this cell works on the `df` already present in the session; it does
# not reload the CSV.
# Mystery word and guess: detect the language only for rows with NaN in
# 'Language'; otherwise use the (mapped) 'Language' value.
df['Mystery_Word_Language'] = df.apply(
    lambda row: detect_language_single_word(row['Mystery Word'])
    if pd.isna(row['Language'])
    else map_language(row['Language']),
    axis=1,
)
df['Guess_Language'] = df.apply(
    lambda row: detect_language_single_word(row['Guess'])
    if pd.isna(row['Language'])
    else map_language(row['Language']),
    axis=1,
)
# Clues: detected for every row; missing clues are marked 'unknown'.
df['Clues_Language'] = df.apply(
    lambda row: 'unknown' if pd.isna(row['Clues']) else detect_language_clues(row['Clues']),
    axis=1,
)

# Drop rows with 'unknown' languages. The copy makes the column assignments
# below operate on an independent frame instead of a slice of the original.
keep_mask = (
    (df['Mystery_Word_Language'] != 'unknown')
    & (df['Guess_Language'] != 'unknown')
    & (df['Clues_Language'] != 'unknown')
)
df = df[keep_mask].copy()

# Determine matching languages for all rows (must precede the comparison,
# which reads this column)
df['Matching_Languages'] = df.apply(determine_matching_languages, axis=1)

# Apply the comparison function to all rows
df['Language_Match'] = df.apply(compare_languages, axis=1)

# Check how many times 'Language_Match' is set to 'no'
num_no_matches = df['Language_Match'].value_counts().get('no', 0)
print(f"Number of rows where 'Language_Match' is set to 'no': {num_no_matches}")

# Save the cleaned dataset with the new 'Language_Match' and 'Matching_Languages' columns
df.to_csv('/mnt/data/Game_data_cleaned.csv', index=False)

# Display the first few rows of the updated dataframe
df.head()

Thrown out: 02.08.2024

In [None]:
import pandas as pd
import enchant
from collections import Counter

# One pyenchant dictionary per supported language, created in the order
# English, French, German.
d_en, d_fr, d_de = map(enchant.Dict, ("en_US", "French", "German_de_DE"))

# Running total of words rejected by the dictionaries; updated by validate_round.
invalid_word_count = 0

def word_in_dictionaries(word):
    """Return whether *word* is accepted by the English, French or German dictionary.

    The word is stripped and lower-cased before the lookup. Missing
    (non-string) and blank input is reported as not found: NaN cells have no
    string methods, and pyenchant raises ValueError for an empty string.
    """
    if not isinstance(word, str):
        return False
    word = word.strip().lower()
    if not word:
        return False
    return d_en.check(word) or d_fr.check(word) or d_de.check(word)

def validate_round(row):
    """Check every word of a round against the dictionaries.

    Looks up row['Mystery Word'], row['Guess'] and each comma-separated entry
    of row['Clues'] with word_in_dictionaries. Every word that is not found
    adds one to the module-level invalid_word_count.

    Returns a truthy value only if the mystery word, the guess and all clues
    were found.
    """
    global invalid_word_count

    def _lookup(candidate):
        # Look up one word and bump the global counter when it is rejected.
        global invalid_word_count
        found = word_in_dictionaries(candidate)
        if not found:
            invalid_word_count += 1
        return found

    mystery_ok = _lookup(row['Mystery Word'])
    guess_ok = _lookup(row['Guess'])

    # Every clue is checked (no early exit) so that all invalid clues are counted.
    all_clues_ok = True
    for clue in row['Clues'].split(','):
        if not _lookup(clue):
            all_clues_ok = False

    return mystery_ok and guess_ok and all_clues_ok

# Validate each round
df['Valid_Round'] = df.apply(validate_round, axis=1)

# Split the dataset into valid and rejected rounds
valid_rounds = df[df['Valid_Round']]
rejected_rounds = df[~df['Valid_Round']]

# Drop the helper column
valid_rounds = valid_rounds.drop(columns=['Valid_Round'])
rejected_rounds = rejected_rounds.drop(columns=['Valid_Round'])

# Save the datasets
valid_rounds.to_csv('/mnt/data/Game_data_valid.csv', index=False)
rejected_rounds.to_csv('/mnt/data/rejected_rounds.csv', index=False)

# Calculate the number of words rejected. This counts every word contained in
# a rejected round (mystery word, guess and clues), whether or not that
# particular word was found in a dictionary.
num_rejected_words = (
    rejected_rounds['Mystery Word'].apply(lambda x: len(x.split(','))) +
    rejected_rounds['Guess'].apply(lambda x: len(x.split(','))) +
    rejected_rounds['Clues'].apply(lambda x: len(x.split(',')))
).sum()

# Printout of the number of rounds and words tossed out
num_rejected_rounds = len(rejected_rounds)
print(f"Number of rounds rejected: {num_rejected_rounds}")
print(f"Number of words rejected: {num_rejected_words}")
print(f"Number of invalid words: {invalid_word_count}")

# Display the first few rows of the valid rounds dataframe. Kept as the last
# expression of the cell: a notebook only renders the value of the final
# expression, so the call had no visible effect in the middle of the cell.
valid_rounds.head()



In [None]:
import pandas as pd
import enchant
from collections import Counter

# One pyenchant dictionary per supported language, created in the order
# English, French, German.
d_en, d_fr, d_de = (enchant.Dict(tag) for tag in ("en_US", "French", "German_de_DE"))

# Global counter for invalid words
invalid_word_count = 0

def word_in_dictionaries(word):
    """Return whether *word* is accepted by the English, French or German dictionary.

    Before the lookup the word is stripped, lower-cased and has all double
    quotes and hyphens removed. If nothing is left, the result is False.
    """
    # Delete '"' and '-' in one pass after trimming and lower-casing.
    cleaned = word.strip().lower().translate(str.maketrans('', '', '"-'))
    if not cleaned:
        return False
    return d_en.check(cleaned) or d_fr.check(cleaned) or d_de.check(cleaned)

def validate_round(row):
    """Check every word of a round against the dictionaries.

    Looks up row['Mystery Word'], row['Guess'] and each comma-separated entry
    of row['Clues'] with word_in_dictionaries. Every word that is not found
    adds one to the module-level invalid_word_count.

    Returns a truthy value only if the mystery word, the guess and all clues
    were found.
    """
    global invalid_word_count

    def _lookup(candidate):
        # Look up one word and bump the global counter when it is rejected.
        global invalid_word_count
        found = word_in_dictionaries(candidate)
        if not found:
            invalid_word_count += 1
        return found

    mystery_ok = _lookup(row['Mystery Word'])
    guess_ok = _lookup(row['Guess'])

    # Every clue is checked (no early exit) so that all invalid clues are counted.
    all_clues_ok = True
    for clue in row['Clues'].split(','):
        if not _lookup(clue):
            all_clues_ok = False

    return mystery_ok and guess_ok and all_clues_ok

# Validate each round
df['Valid_Round'] = df.apply(validate_round, axis=1)

# Split the dataset into valid and rejected rounds
valid_rounds = df[df['Valid_Round']]
rejected_rounds = df[~df['Valid_Round']]

# Drop the helper column
valid_rounds = valid_rounds.drop(columns=['Valid_Round'])
rejected_rounds = rejected_rounds.drop(columns=['Valid_Round'])

# Save the datasets
valid_rounds.to_csv('/mnt/data/Game_data_valid.csv', index=False)
rejected_rounds.to_csv('/mnt/data/rejected_rounds.csv', index=False)

# Calculate the number of words rejected. This counts every word contained in
# a rejected round (mystery word, guess and clues), whether or not that
# particular word was found in a dictionary.
num_rejected_words = (
    rejected_rounds['Mystery Word'].apply(lambda x: len(x.split(','))) +
    rejected_rounds['Guess'].apply(lambda x: len(x.split(','))) +
    rejected_rounds['Clues'].apply(lambda x: len(x.split(',')))
).sum()

# Printout of the number of rounds and words tossed out
num_rejected_rounds = len(rejected_rounds)
print(f"Number of rounds rejected: {num_rejected_rounds}")
print(f"Number of words rejected: {num_rejected_words}")
print(f"Number of invalid words: {invalid_word_count}")

# Display the first few rows of the valid rounds dataframe. Kept as the last
# expression of the cell: a notebook only renders the value of the final
# expression, so the call had no visible effect in the middle of the cell.
valid_rounds.head()