In [1]:
# Colab setup: install/upgrade the Google Sheets and Drive client libraries
# quietly (-q) before they are imported.
!pip install --upgrade -q gspread
!pip install -U -q PyDrive


import gspread
import pandas as pd
from google.colab import auth
# Authenticate the Colab user first; the default credentials are requested
# only after this call.
auth.authenticate_user()
from google.auth import default

# default() returns a pair; only the credentials are kept.
creds, _ = default()
# gspread client used by the later cells to open the responses and
# leaderboard spreadsheets.
gc = gspread.authorize(creds)

In [2]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# PyDrive client built from the application-default credentials.
# `drive` is the handle Evaluator.score_solution uses to download
# submission files by id.
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [3]:
# Imported under the alias `gdrive` so the PyDrive client named `drive`
# is not overwritten.
from google.colab import drive as gdrive

# Mount point under which the y_test file path is resolved.
gdrive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
# File holding the ground-truth targets that submissions are scored against.
y_test_fname = '/content/gdrive/MyDrive/ML práctico 2022/2022-2C/y_test.jl'

with open(y_test_fname) as f:
  # Split on any whitespace rather than on '\n' only: a trailing newline or a
  # blank line would otherwise yield '' and make float() raise.  This is also
  # the tokenisation applied to submissions in Evaluator.score_solution, so
  # both sides of the comparison are parsed the same way.
  y_test = [float(token) for token in f.read().split()]

In [5]:
from datetime import datetime
import numpy as np
from collections import Counter
from time import sleep
from sklearn.metrics import mean_squared_error, mean_absolute_error, \
      mean_absolute_percentage_error
from IPython.display import clear_output, display
import pytz

# Timezone in which form timestamps are interpreted and leaderboard
# times are printed.
tz = pytz.timezone('America/Buenos_Aires')

# Positions removed from a prediction list whose length does not match
# y_test, before the lengths are compared again
# (see Evaluator.score_solution).
ARS_indices = {
    0, 1261, 2366, 2755, 3024, 3028, 3030, 3033, 3040, 3045,
    3518, 5147, 5197, 5214, 7265, 7266, 10538, 11591, 12277, 12278,
    13796, 15911, 16488, 17254, 17261, 17520,
}

class Evaluator:
  """Scores workshop submissions and assembles the competition tables.

  Submissions are read from the 'Taller de model selection (Responses)'
  spreadsheet through the module-level gspread client `gc`.  The prediction
  files they reference are downloaded with the module-level PyDrive client
  `drive` and compared against the module-level `y_test`.
  """

  # Score (or error) records keyed by Drive file id.  The dict is defined on
  # the class, so it is shared by every instance and survives re-creating the
  # evaluator within the same kernel session.
  _cached_scores = {}

  # Columns, in order, of the table returned by get_competition_table().
  _TABLE_COLUMNS = ['team_name', 'log(rmse)', 'secs ago', 'submissions',
                    'log(mae)', 'mape', 'Comment']

  def get_submissions(self):
    """Return every form response as a dict keyed by the sheet's header row.

    Returns:
      list[dict]: one dict per response row; empty when the sheet has no rows.
    """
    spreadsheet = gc.open('Taller de model selection (Responses)')
    values = spreadsheet.worksheets()[0].get_all_values()
    if not values:
      return []
    header = values[0]
    return [dict(zip(header, row)) for row in values[1:]]

  def get_competition_table(self, group_by_team=True):
    """Build the leaderboard as a DataFrame.

    Args:
      group_by_team: when True, keep only each team's best (lowest
        'log(rmse)') valid submission and sort by that score; when False,
        return every valid submission in sheet order.

    Returns:
      pandas.DataFrame with the columns listed in `_TABLE_COLUMNS`.
      'submissions' is the team's total number of form rows (including failed
      ones) and 'secs ago' is the time elapsed since the row's timestamp.
      The frame is empty when there is no valid submission.
    """
    rows = self.get_submissions()

    team_submissions = Counter()
    team_best = {}
    all_submissions = []
    for row in rows:
      row['team_name'] = team_name = row.pop('Nombre equipo')
      team_submissions[team_name] += 1

      # The Drive file id is whatever follows the last '=' in the link.
      file_id = row['Submission'].split('=')[-1]
      score = self.score_solution(file_id)
      if score['status'] == 'error':
        print(f'Failed to process solution from team "{team_name}" ({row["Comment"]}). \n\t' + score['reason'] )
        continue

      # Merge into a NEW dict.  `score` is the object stored in the cache, so
      # updating it in place would make every row that points at the same
      # file share (and overwrite) team name, comment and timestamp.
      result = {**score, **row}
      all_submissions.append(result)
      if group_by_team:
        best = team_best.get(team_name)
        if best is None or result['log(rmse)'] < best['log(rmse)']:
          team_best[team_name] = result

    docs = list(team_best.values()) if group_by_team else all_submissions
    if not docs:
      # Nothing valid yet: return the expected schema instead of failing on
      # the missing 'log(rmse)' / 'Timestamp' columns.
      return pd.DataFrame(columns=self._TABLE_COLUMNS)

    for doc in docs:
      # Look the count up by the row's own team, not by the loop variable
      # left over from the last processed row.
      doc['submissions'] = team_submissions[doc['team_name']]

    df = pd.DataFrame(docs)
    if group_by_team:
      df = df.sort_values('log(rmse)')

    now = datetime.now(tz=tz)

    def secs_ago(timestamp):
      # Form timestamps are naive 'month/day/year H:M:S' strings; they are
      # interpreted in `tz` before being subtracted from the current time.
      submitted = tz.localize(datetime.strptime(timestamp, '%m/%d/%Y %H:%M:%S'))
      # Lower-case 's': the upper-case 'S' alias is deprecated in pandas.
      return pd.Timedelta(now - submitted).round('s')

    df['secs ago'] = df['Timestamp'].apply(secs_ago)
    return df[self._TABLE_COLUMNS]

  def _cache_error(self, id, reason):
    """Store and return an error record for submission `id`."""
    res = self._cached_scores[id] = {'status': 'error', 'reason': reason}
    return res

  def score_solution(self, id):
    """Download submission `id` from Drive and score it against `y_test`.

    Args:
      id: Drive file id of a file with whitespace-separated numeric predictions.

    Returns:
      dict: either {'status': 'ok', 'log(rmse)', 'log(mae)', 'mape'} or
      {'status': 'error', 'reason'}.  The returned dict is the cached object
      itself, so callers must copy it before modifying it.
    """
    if id in self._cached_scores:
      return self._cached_scores[id]

    downloaded = drive.CreateFile({'id': id})
    downloaded.GetContentFile('tmp_submission.csv')
    with open('tmp_submission.csv') as f:
      raw = f.read()

    try:
      y_pred = [float(token) for token in raw.split()]
    except ValueError as exc:
      # A malformed file (header row, commas, text...) is reported like any
      # other bad submission instead of aborting the whole table.
      return self._cache_error(id, f'Could not parse predictions as numbers ({exc})')

    if len(y_pred) != len(y_test):
      # assume v1 data submission: drop the ARS_indices positions and retry
      orig_len = len(y_pred)
      y_pred = [e for i, e in enumerate(y_pred) if i not in ARS_indices]
      # if it's still different, there was an error
      if len(y_pred) != len(y_test):
        return self._cache_error(
            id, f'Inconsistent lengths. Expected {len(y_test)}, received {orig_len}')

    if not np.isfinite(y_pred).all():
      # float() accepts 'nan'/'inf', but the metric functions reject them.
      return self._cache_error(id, 'Predictions contain NaN or infinite values')

    res = self._cached_scores[id] = {
        'status': 'ok',
        'log(rmse)': np.log10(mean_squared_error(y_test, y_pred) ** 0.5),
        'log(mae)': np.log10(mean_absolute_error(y_test, y_pred)),
        'mape': mean_absolute_percentage_error(y_test, y_pred),
    }
    return res

In [6]:
# Evaluator instance reused by the publishing cell and the live
# leaderboard loop below.
ev = Evaluator()

In [13]:
# Column order written to both leaderboard worksheets.
columns = ['team_name', 'log(rmse)', 'secs ago', 'submissions', 'log(mae)', 'mape', 'Comment']


def _as_sheet_rows(table):
  """Return a header row followed by one list per record of `table`."""
  records = table.to_dict(orient='records')
  for record in records:
    # 'secs ago' holds Timedelta objects; send them to the sheet as text.
    record['secs ago'] = str(record['secs ago'])
  return [columns] + [[record[c] for c in columns] for record in records]


# Open the spreadsheet once and reuse it for both worksheets.
leaderboard = gc.open('Leaderboard Taller Model Selection')

# Worksheet 1: every valid submission.
df = ev.get_competition_table(group_by_team=False)
rows = _as_sheet_rows(df)
s = leaderboard.get_worksheet(1)
# Named arguments: gspread 6 swapped the positional order of update() to
# (values, range_name), so the keyword form avoids depending on that order.
s.update(range_name='A1', values=rows)


# Worksheet 0: best submission per team, preceded by a "last updated" line.
df = ev.get_competition_table(group_by_team=True)
rows = (
    [['Ultima actualizacion', datetime.now(tz=tz).strftime('%d/%m %H:%M:%S')]] +
    _as_sheet_rows(df)
)
s = leaderboard.get_worksheet(0)
s.update(range_name='A1', values=rows)

Failed to process solution from team "LaPlebe" (algunos hypers). 
	Inconsistent lengths. Expected 18417, received 18121
Failed to process solution from team "LaPlebe" (aber). 
	Inconsistent lengths. Expected 18417, received 18121
Failed to process solution from team "LaPlebe" (algunos hypers). 
	Inconsistent lengths. Expected 18417, received 18121
Failed to process solution from team "LaPlebe" (aber). 
	Inconsistent lengths. Expected 18417, received 18121


{'spreadsheetId': '1ywBjLMHjrlL2vla-maO7ZWxNzpsWxq7kP0UohzAUHZw',
 'updatedRange': "'best by team'!A1:G16",
 'updatedRows': 16,
 'updatedColumns': 7,
 'updatedCells': 107}

In [None]:
from collections import Counter
from time import sleep, time
from sklearn.metrics import mean_squared_error
from IPython.display import clear_output, display, display_markdown

# Show floats with two decimals in the rendered table.
pd.options.display.float_format = lambda x: f'{x:.02f}'


def _ranking_snapshot(table):
  """Return the (team, score) pairs of `table` in display order, or None."""
  if table is None:
    return None
  return list(zip(table['team_name'], table['log(rmse)']))


df = None
last_print = 0
while True:
  prev_df = df
  df = ev.get_competition_table()
  # Compare plain Python values rather than the two Series directly:
  # Series comparison raises ValueError when the indexes are not identically
  # labelled, which happens as soon as the ranking order changes while the
  # number of teams stays the same.
  changed = _ranking_snapshot(df) != _ranking_snapshot(prev_df)
  # Redraw on any change, and at least once a minute so the clock advances.
  if changed or time() - last_print > 60:
    now = datetime.now(tz)
    clear_output()
    display_markdown(f'**Fecha**: {now.strftime("%d/%m/%Y")}', raw=True)
    display_markdown(f'**Hora**: {now.strftime("%H:%M:%S")}', raw=True)
    display(df)
    last_print = time()
  sleep(10)

**Fecha**: 20/10/2022

**Hora**: 14:19:38

Unnamed: 0,team_name,log(rmse),secs ago,submissions,log(mae),mape,Comment
2,LaPlebe,5.29,1 days 03:41:22,17,4.84,0.34,prueba estimators
3,El Arca,5.3,1 days 02:26:13,5,4.85,0.4,"teniendo en cuenta zona, m2 de zona, lgb de 500"
6,miku fanclub [oficial],5.3,1 days 01:26:09,5,4.86,0.41,Pipelines Baños + descriptor features (+hab+su...
10,P̴̢̨͠ȟ̸͆ͅ'̷̡̦̍̂ṅ̸̲͎̈ḡ̵̜l̶̛͚̤̒ừ̵ͅi̴͇̎̕ ̷͋͠...,5.32,1 days 01:14:43,4,4.86,0.41,"lgbm con tipo propiedad, transporte y profesio..."
9,Nito2,5.32,1 days 02:32:21,4,4.9,0.44,v1 con LGMB
12,fmcurti,5.32,0 days 19:58:43,1,4.88,0.44,Ensemble 2 mejores modelos + fasttext + subtes
5,Placeholder,5.33,1 days 01:33:53,3,4.88,0.44,version 3 - barrio null = indef.
4,,5.33,6 days 16:03:19,1,4.89,0.42,Data de estaciones de subte cercanas
8,Nito,5.34,1 days 03:03:10,5,4.9,0.5,Test
11,The kickstarters,5.34,1 days 01:23:05,1,4.89,0.44,"Catboost, usando fastext, y onehotencoder para..."


Failed to process solution from team "LaPlebe" (algunos hypers). 
	Inconsistent lengths. Expected 18417, received 18121
Failed to process solution from team "LaPlebe" (aber). 
	Inconsistent lengths. Expected 18417, received 18121
Failed to process solution from team "LaPlebe" (algunos hypers). 
	Inconsistent lengths. Expected 18417, received 18121
Failed to process solution from team "LaPlebe" (aber). 
	Inconsistent lengths. Expected 18417, received 18121


KeyboardInterrupt: ignored