<a href="https://colab.research.google.com/github/csnick93/sports_classification/blob/main/SportsClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount the user's Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): nothing else in the visible notebook reads from /content/drive — confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

# Set up Git LFS

In [None]:
# Download the git-lfs v2.12.0 linux-amd64 release tarball from GitHub into the current directory.
!wget https://github.com/git-lfs/git-lfs/releases/download/v2.12.0/git-lfs-linux-amd64-v2.12.0.tar.gz

In [None]:
# Unpack the git-lfs tarball into the current directory (verbose listing of extracted files).
!tar -xvf  git-lfs-linux-amd64-v2.12.0.tar.gz


In [None]:
# Run the install script found in the current directory.
# Presumably this is the installer extracted from the git-lfs tarball — confirm.
!./install.sh

# Getting the code repo

In [None]:
# Clone the project repository; the rest of the notebook reads code, data and
# mlruns from the resulting ./sports_classification directory.
!git clone https://github.com/csnick93/sports_classification.git

In [None]:
!pip install mlflow

In [None]:
!pip install --upgrade fastai

# Adding mlflow functionality

In [None]:
import mlflow
from mlflow import log_metric, log_param, log_artifacts,log_artifact

In [None]:
# Store mlflow tracking data in the `mlruns` folder of the cloned repo.
# The path is relative, so it resolves against the current working directory.
mlflow.set_tracking_uri('sports_classification/mlruns')

In [None]:
def get_max_run_id(experiment_id):
    """Return the number of runs mlflow reports for one experiment.

    Args:
        experiment_id: Id of the experiment whose runs are searched.

    Returns:
        The length of the result of ``mlflow.search_runs`` restricted to
        ``experiment_id``.
    """
    return len(mlflow.search_runs(experiment_ids=[experiment_id]))

def connect_to_experiment(experiment_name):
    """Fetch an mlflow experiment by name, creating it when it does not exist.

    Args:
        experiment_name: Name of the mlflow experiment to look up or create.

    Returns:
        A tuple ``(experiment_id, run_id)``. ``run_id`` is the 1-based number
        of the run that is about to be started: 1 for a freshly created
        experiment, otherwise the number of existing runs plus one.
    """
    experiment = mlflow.get_experiment_by_name(experiment_name)
    if experiment is None:
        # A brand-new experiment has no runs yet, so the upcoming run is #1.
        return mlflow.create_experiment(experiment_name), 1

    experiment_id = experiment.experiment_id
    # The helper returns the count of existing runs; the upcoming run is the
    # next one. Without the +1 the first and second run both reported 1.
    return experiment_id, get_max_run_id(experiment_id) + 1

# Imports

In [None]:
from fastai.vision.all import *
from fastai.data.all import *
import pandas as pd
from tqdm import tqdm

# Experiment Configs

In [None]:
#config
class Config:
    """Settings for a single training experiment.

    Attributes:
        augmentations: Flag controlling whether data augmentation is used.
        img_size: Image size (integer) the inputs are resized to.
        num_epochs: Number of training epochs (integer).
        model_arch: Model architecture, normally a constructor function such
            as ``resnet18``.
    """

    def __init__(self,
                 augmentations = False,
                 img_size = 224,
                 num_epochs = 5,
                 model_arch = resnet18):
        self.augmentations = augmentations
        self.img_size = img_size
        self.num_epochs = num_epochs
        self.model_arch = model_arch

    def _arch_name(self):
        """Return a readable name for ``model_arch``.

        Uses the object's ``__name__`` (e.g. ``'resnet18'`` for the function)
        and falls back to ``str(model_arch)`` for objects without one, instead
        of parsing the function repr, which breaks for anything whose repr
        does not contain a space.
        """
        return getattr(self.model_arch, '__name__', str(self.model_arch))

    def __str__(self):
        """Return ``<augmentations>_<img_size>_<num_epochs>_<arch name>``."""
        return '%s_%i_%i_%s' % (str(self.augmentations), self.img_size,
                                self.num_epochs, self._arch_name())

    def mlflow_config(self):
        """Return the settings as a list of ``(name, value)`` pairs for logging.

        ``model_arch`` is reported by name rather than as the raw object, whose
        repr would embed a memory address that changes on every run.
        """
        params = dict(self.__dict__)
        params['model_arch'] = self._arch_name()
        return list(params.items())
  
# Settings used for this notebook run.
config = Config(
    augmentations=True,
    img_size=224,
    num_epochs=10,
    model_arch=resnet34,
)

# Get Data and inspect

In [None]:
# Root folder of the dataset inside the cloned repo.
path = Path('sports_classification/data')
# Image files found under <path>/train.
# NOTE(review): `train_val_folder` is not referenced again in the visible notebook.
train_val_folder = get_image_files(path/"train")
# Table describing the train/validation samples; it is what the DataBlock is fed with.
train_val_data = pd.read_csv(path/'train_val_data.csv')

In [None]:
# Show the first rows of the train/validation table.
train_val_data.head()

In [None]:
# Build the DataBlock once: the two configurations differ only in whether
# aug_transforms() is applied as batch transforms, so that is the single
# conditional part.
#   - x: column 0 of the dataframe, prefixed with the data folder
#   - y: column 1 of the dataframe
#   - ColSplitter() is used with its defaults — presumably it reads an
#     `is_valid` column; confirm against the CSV.
data_block = DataBlock(blocks=(ImageBlock, CategoryBlock),
                       splitter=ColSplitter(),
                       get_x=ColReader(0, pref='sports_classification/data/'),
                       get_y=ColReader(1),
                       item_tfms=Resize(config.img_size),
                       # None is DataBlock's default for batch_tfms, i.e. the
                       # same as not passing the argument at all.
                       batch_tfms=aug_transforms() if config.augmentations else None
                       )

In [None]:
# Create the dataloaders from the train/validation table.
dls = data_block.dataloaders(train_val_data)

In [None]:
# Visual sanity check of one batch from the dataloaders.
dls.show_batch()

# Start Training

In [None]:
# Create the learner for the configured architecture, tracking error_rate.
# SaveModelCallback monitors 'valid_loss' and saves under the name 'best_model'.
# NOTE(review): newer fastai releases call this `vision_learner` and appear to keep
# `cnn_learner` only as a deprecated alias — confirm against the installed version.
learn = cnn_learner(dls, config.model_arch, metrics=error_rate, 
                    cbs = [SaveModelCallback(monitor='valid_loss', fname='best_model')])

In [None]:
# Run the learning-rate finder.
# NOTE(review): the result is not captured; the learning rate passed to
# fine_tune in this notebook is a hard-coded literal.
learn.lr_find()

In [None]:
# Fine-tune for the configured number of epochs.
# 0.005 is the second positional argument — presumably fastai's `base_lr`; confirm.
learn.fine_tune(config.num_epochs, 0.005)

In [None]:
# Pair each metric name with the entry at the same position in
# learn.final_record (positions 0, 1 and 2).
learning_results = [
    (metric_name, learn.final_record[position])
    for position, metric_name in enumerate(
        ('final_train_loss', 'final_val_loss', 'final_error_rate'))
]

# Inspect results

In [None]:
# Display sample results from the trained learner.
learn.show_results()

In [None]:
# Build an Interpretation object from the trained learner.
interp = Interpretation.from_learner(learn)

In [None]:
# Plot the 9 top-loss samples on a 15x10 figure.
interp.plot_top_losses(9, figsize=(15,10))


In [None]:
# Build a classification-specific interpretation and plot its confusion matrix.
# NOTE(review): this is a second `from_learner` call on the same learner,
# separate from the one that created `interp`.
class_interp = ClassificationInterpretation.from_learner(learn)
class_interp.plot_confusion_matrix(title='Confusion matrix', figsize=(10,10))

# Make prediction on test set

In [None]:
# Collect the image files found under <path>/test.
test_folder = get_image_files(path/"test")

In [None]:
# The export is named after the string form of the experiment config.
model_path = f'models/{config}.pkl'
learn.export(fname=model_path)

In [None]:
# Column-wise accumulator: one list per output column.
predictions = {'image': [], 'sports': []}
for image in tqdm(test_folder):
    # Report the file relative to the data folder ('sports_classification/data' -> '.').
    relative_name = str(image).replace('sports_classification/data', '.')
    pred = learn.predict(image)
    predictions['image'].append(relative_name)
    # The first element of the prediction result is stored as the label.
    predictions['sports'].append(pred[0])

In [None]:
# Turn the collected predictions into a table with columns 'image' and 'sports'.
pred_df = pd.DataFrame(predictions)

In [None]:
# Write the predictions to a CSV in the current working directory, without the index column.
test_prediction_file = 'test_evaluation.csv'
pred_df.to_csv(test_prediction_file, index=False)

# Log the results

In [None]:
# Look up (or create) the experiment and open an mlflow run in it.
# The run stays active until mlflow.end_run() is called.
# NOTE(review): `run_id` is not used anywhere else in the visible notebook.
experiment_id, run_id = connect_to_experiment('sports_classification')
mlflow.start_run(run_name='sports_classification_run', experiment_id=experiment_id)

In [None]:
# Log every experiment setting as an mlflow parameter.
for param_name, param_value in config.mlflow_config():
    log_param(param_name, param_value)

In [None]:
# Log every collected training result as an mlflow metric.
for metric_name, metric_value in learning_results:
    log_metric(metric_name, metric_value)

In [None]:
# Attach the exported model and the test-set predictions CSV to the active run.
log_artifact(model_path)
log_artifact(test_prediction_file)

In [None]:
# Close the active mlflow run.
mlflow.end_run()

In [None]:
# Pull remote changes into the cloned repo; --no-edit accepts the default
# merge commit message so the command does not wait for an editor.
!cd sports_classification && git pull --no-edit

In [None]:
!git config --global user.email "nickvonroden@gmail.com" && \
  git config --global user.name "Nicolas von Roden" && \
  cd sports_classification && \
  git add mlruns && \
  git commit -m 'updating mlflow' && \
  git push https://<usr>:<pwd>@github.com/<usr>/sports_classification.git --all