In [1]:
# import libraries
import os
import numpy as np
import pandas as pd
import urllib.parse

### Creating the Dataframe for the LMQL-Pipeline

In [8]:
data_dir = os.path.join('..', 'dependencies', 'wikispeedia_paths-and-graph')

In [35]:
# decode url text inside page names
def decode_list(url_encoded_list):
    """Decodes url_encoded path.
    
    Args:
        url_encoded_list (list): list of url_encoded page names
    
    Returns:
        list: list of decoded page names
    """
    return [urllib.parse.unquote(element).replace(',', '') for element in url_encoded_list]

# pull in finished games and prepare for append
paths_finished = pd.read_csv(os.path.join(data_dir, 'paths_finished.tsv'), sep='\t', skiprows = 16, header = None, names = ['hashedIpAddress', 'timestamp', 'durationInSec', 'path', 'rating'])
paths_finished['path_decoded'] = paths_finished['path'].str.split(';').apply(decode_list)
paths_finished['origin'] = paths_finished['path_decoded'].str[0]
paths_finished['goal'] = paths_finished['path_decoded'].str[-1]
paths_finished['finished'] = 1

# pull in unfinished games and prepare for append
paths_unfinished = pd.read_csv(os.path.join(data_dir, 'paths_unfinished.tsv'), sep='\t', skiprows=17, header = None, names = ['hashedIpAddress', 'timestamp', 'durationInSec', 'path', 'target', 'type'])
paths_unfinished['path_decoded'] = paths_unfinished['path'].str.split(';').apply(decode_list)
paths_unfinished['origin'] = paths_unfinished['path_decoded'].str[0]
paths_unfinished = paths_unfinished.rename(columns={'target': 'goal'})
paths_unfinished['finished'] = 0

# append
all_paths = pd.concat([paths_finished, paths_unfinished])
all_paths['game_pair'] = all_paths['origin'] + "->" + all_paths['goal']
game_pair_counts = all_paths['game_pair'].value_counts().rename_axis('game_pair').reset_index(name='count')
game_pair_counts = game_pair_counts.rename(columns={'count': 'human_attempts'})

# compute how many times the human players played each game pair. Select only those with 20 or more attemps 
game_pair_cands = game_pair_counts.query('human_attempts >= 20')
print('Selecting the games with 20 or more human attempts, we are left with {} game pairs.'.format(game_pair_cands.shape[0]))
game_pair_cands.head()

Selecting the games with 20 or more human attempts, we are left with 75 game pairs.


Unnamed: 0,game_pair,human_attempts
0,Brain->Telephone,2044
1,Theatre->Zebra,1777
2,Asteroid->Viking,1770
3,Pyramid->Bean,1586
4,Batman->Wood,223


In [24]:
def all_to_candidates(
    all_paths: pd.DataFrame,
    candidates: pd.DataFrame,
    new_var_pre_agg: str,
    new_var_post_agg: str,
    finished_only: bool
) -> pd.DataFrame:    
    """For mapping all_path data to candidate set
    
    Args:
        all_paths (pd.DataFrame): all paths data
        candidates (pd.DataFrame): candidate set
        new_var_pre_agg (str): variable to aggregate
        new_var_post_agg (str): name of new variable
        finished_only (bool): whether to aggregate only finished games
    Returns:
        pd.DataFrame: candidates with new variable
    """
    if finished_only == True:
        all_pairs_avg = all_paths[all_paths['finished'] == 1].groupby('game_pair')[new_var_pre_agg].agg('mean').reset_index().copy()
    else:
        all_pairs_avg = all_paths.groupby('game_pair')[new_var_pre_agg].agg('mean').reset_index().copy()
    all_pairs_avg = all_pairs_avg.rename(columns={new_var_pre_agg:new_var_post_agg})
    return candidates.merge(right=all_pairs_avg, on='game_pair').copy()

In [27]:
## Average rounds-to-completion
all_paths['game_length'] = all_paths['path_decoded'].apply(len)
all_paths.loc[all_paths['finished'] == 0, 'game_length'] = np.nan
game_pair_cands_pipeline = all_to_candidates(all_paths, game_pair_cands, 'game_length', 'avg_game_length', True)

game_pair_cands_pipeline[['origin', 'target']] = game_pair_cands_pipeline['game_pair'].str.split('->', expand=True)


In [28]:
game_pair_cands_pipeline

Unnamed: 0,game_pair,human_attempts,avg_game_length,origin,target
0,Brain->Telephone,2044,7.100000,Brain,Telephone
1,Theatre->Zebra,1777,7.836464,Theatre,Zebra
2,Asteroid->Viking,1770,7.516779,Asteroid,Viking
3,Pyramid->Bean,1586,8.246106,Pyramid,Bean
4,Batman->Wood,223,7.263514,Batman,Wood
...,...,...,...,...,...
70,Computer->Whale,20,7.214286,Computer,Whale
71,Colombia->Meat,20,8.750000,Colombia,Meat
72,Dog->Beer,20,6.000000,Dog,Beer
73,Batman->Bill_Clinton,20,5.909091,Batman,Bill_Clinton


In [29]:
game_pair_cands_pipeline.to_csv('pipeline_dataset.csv', columns=['game_pair', 'origin', 'target'], index=False)

# 2. OBTAIN MISTRAL 7B DATA

The following was run on `NVIDIA® V100 GPU` for ~ 20 hours. 

Make sure to have the `LMQL` environment activated. 

In [None]:
!python lmql_pipeline.py 

**NOTE THAT**: the results are contained in a big nested dictionary of the following structure:

```python
{
    'game_pair0': {
        rep0 = [steps], rep1 = [steps], ..., rep9 = [steps]
    },

    'game_pair1': {
        rep0 = [steps], rep1 = [steps], ..., rep9 = [steps]
    },

    ...

    'game_pair74': {
        rep0 = [steps], rep1 = [steps], ..., rep9 = [steps]
    },
    
}
```


## 3. RESULTS OF THE PIPELINE

In [83]:
import pickle

with open('Mistral_Games.pickle', 'rb') as file:
    Mistral_Games = pickle.load(file)


In [84]:
l = list(game_pair_cands_pipeline.game_pair)
np.where(l == 'Electricity->Anne_Frank')

  np.where(l == 'Electricity->Anne_Frank')


(array([], dtype=int64),)

In [91]:
# make the keys of the dictionary follow the same order as the list above, and remove the keys that do not appear in the list
Mistral_Games = {k: Mistral_Games[k] for k in list(l) if k in Mistral_Games}

**NOTE THAT**: the results are contained in a big nested dictionary of the following structure:

{   'game_pair0': {
        rep0 = [steps], rep1 = [steps], ..., rep9 = [steps]
    },

    game_pair1': {
        rep0 = [steps], rep1 = [steps], ..., rep9 = [steps]
    },

    ...

    game_pair49': {
        rep0 = [steps], rep1 = [steps], ..., rep9 = [steps]
    },
    
}

In [96]:
data = []

for game_pair, reps in Mistral_Games.items():
    for rep, values in reps.items():
        data.append([game_pair, rep] + values)

df_results = pd.DataFrame(data)
df_results.columns = ['Game Pair', 'Repetition'] + [f'Value_{i}' for i in range(df_results.shape[1] - 2)]

df_results.head()

Unnamed: 0,Game Pair,Repetition,Value_0,Value_1,Value_2,Value_3,Value_4,Value_5,Value_6,Value_7,...,Value_10,Value_11,Value_12,Value_13,Value_14,Value_15,Value_16,Value_17,Value_18,Value_19
0,Brain->Telephone,rep0,Computer_science,Computer_programming,Microsoft_Windows,Windows_XP,Internet_Explorer,Microsoft_Windows,Windows_XP,Internet_Explorer,...,Radio,Television,Telecommunication,Telephone,,,,,,
1,Brain->Telephone,rep1,Computer_science,Computer_programming,Microsoft_Windows,GNU,Internet,Information,Mass_media,Telephone,...,,,,,,,,,,
2,Brain->Telephone,rep2,Computer_science,Information,Communication,Telephone,,,,,...,,,,,,,,,,
3,Brain->Telephone,rep3,Computer_science,Information,Communication,Telephone,,,,,...,,,,,,,,,,
4,Brain->Telephone,rep4,Computer_science,Information,Communication,Telephone,,,,,...,,,,,,,,,,
