<a href="https://colab.research.google.com/github/csnick93/sports_classification/blob/main/SportsClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from pathlib import Path
cloud_dir = Path('/content/drive/My Drive/SportsClassification')


In [None]:
!rsync --info=progress2 ./drive/My\ Drive/SportsClassification/data.zip . && unzip data.zip 

In [None]:
!rsync --info=progress2 ./drive/My\ Drive/SportsClassification/mlruns.zip . && unzip mlruns.zip

In [None]:
!mkdir ~/.kaggle
!rsync --info=progress2 ./drive/My\ Drive/SportsClassification/kaggle.json ~/.kaggle

In [None]:
# Locations of the unzipped dataset and the MLflow tracking store.
data_dir = Path('/content/data')
mlflow_dir = Path('/content/mlruns')
# Fail early and name the missing directory if either archive was not unpacked.
assert data_dir.exists(), f'Data directory not found: {data_dir}'
assert mlflow_dir.exists(), f'MLflow directory not found: {mlflow_dir}'

# Getting the code repo

In [None]:
!pip install --upgrade pip

In [None]:
!git clone https://github.com/csnick93/sports_classification.git

In [None]:
!pip install -q mlflow kaggle

In [None]:
!pip install fastai==2.0.16 nbdev

# Imports

In [None]:
from fastai.vision.all import *
from fastai.data.all import *
import pandas as pd
from tqdm.notebook import tqdm

# Experiment Configs

In [None]:
#config
class Config:
  """Settings for a single training experiment.

  Args:
    data_dir: Root directory of the dataset. The train/validation CSV is
      resolved inside it with the ``/`` operator (e.g. a ``pathlib.Path``).
    augmentations: Whether the data pipeline applies augmentations.
    img_size: Target image size handed to the resize transforms.
    num_epochs: Number of epochs for the training phase after unfreezing.
    num_freeze_epochs: Number of epochs for the initial (frozen) phase.
    model_arch: Architecture callable passed to the learner.
    data_subset: If true, use ``subset_train_val_data.csv`` instead of
      ``train_val_data.csv``.
  """

  def __init__(self,
               data_dir,
               augmentations = False,
               img_size = 224,
               num_epochs = 5,
               num_freeze_epochs = 1,
               model_arch = resnet18,
               data_subset=False):
    self.data_dir = data_dir
    self.augmentations = augmentations
    self.img_size = img_size
    self.num_epochs = num_epochs
    self.num_freeze_epochs = num_freeze_epochs
    self.model_arch = model_arch
    self.data_subset = data_subset

    # Pick the CSV that lists the train/validation samples.
    if self.data_subset:
      self.train_val_file = self.data_dir/'subset_train_val_data.csv'
    else:
      self.train_val_file = self.data_dir/'train_val_data.csv'

  def _arch_name(self):
    """Return a short, printable name for ``model_arch``."""
    name = getattr(self.model_arch, '__name__', None)
    if name is not None:
      return name
    # No __name__ (e.g. a functools.partial): fall back to parsing the repr,
    # without raising when the repr contains no space.
    parts = str(self.model_arch).split(' ')
    return parts[1] if len(parts) > 1 else parts[0]

  def __str__(self):
    """Return a compact identifier built from the main settings.

    Format: ``<augmentations>_<img_size>_<num_epochs>_<arch>_<data_subset>``.
    Note that ``num_freeze_epochs`` is not part of the identifier.
    """
    return '%s_%i_%i_%s_%s'%(str(self.augmentations), self.img_size,
                             self.num_epochs, self._arch_name(),
                             self.data_subset)

  def mlflow_config(self):
    """Return all instance attributes as a list of ``(name, value)`` tuples."""
    return list(vars(self).items())
  
# Settings for the current run: full dataset, augmentations enabled, ResNet-101,
# 10 epochs for the first (frozen) phase and 10 epochs after unfreezing.
config = Config(data_dir, augmentations=True, img_size=224, 
                num_epochs=10, num_freeze_epochs = 10,
                model_arch=resnet101, data_subset = False)

* Observations on the data subset:
  * ResNet18:
    * `num_freeze_epochs`: the model starts to overfit after epoch 8

# Get Data and inspect

In [None]:
train_val_folder = get_image_files(config.data_dir/"train")
train_val_data = pd.read_csv(config.train_val_file)

In [None]:
# Arguments common to both pipelines: image -> category, with the split,
# the image path (column 0) and the label (column 1) read from the dataframe.
_data_block_kwargs = dict(blocks=(ImageBlock, CategoryBlock),
                          splitter=ColSplitter(),
                          get_x=ColReader(0, pref=config.data_dir),
                          get_y=ColReader(1))

if config.augmentations:
  # Item-level resize to twice the target size; the batch-level
  # augmentations then produce images of size img_size.
  _data_block_kwargs['item_tfms'] = Resize(2*config.img_size)
  _data_block_kwargs['batch_tfms'] = aug_transforms(size=config.img_size,
                                                    min_scale=0.75)
else:
  # No augmentation: resize straight to the target size.
  _data_block_kwargs['item_tfms'] = Resize(config.img_size)

data_block = DataBlock(**_data_block_kwargs)

In [None]:
dls = data_block.dataloaders(train_val_data)

In [None]:
dls.show_batch()

# Start Training

In [None]:
# Build the learner for the configured architecture, tracking error_rate.
# The SaveModelCallback monitors error_rate and writes its checkpoint under the
# name 'best_model' (presumably keeping the best epoch — confirm in fastai docs).
learn = cnn_learner(dls, config.model_arch, metrics=error_rate, 
                    cbs = [SaveModelCallback(monitor='error_rate', fname='best_model')])

In [None]:
lr_min, lr_steep = learn.lr_find()

In [None]:
learn.fit_one_cycle(config.num_freeze_epochs, 3e-3)

In [None]:
learn.recorder.plot_loss()

In [None]:
learn.unfreeze()
learn.lr_find()

In [None]:
learn.fit_one_cycle(config.num_epochs, lr_max=slice(3e-6,3e-4))

In [None]:
learn.recorder.plot_loss()

In [None]:
# Pair each metric name with the entry at the same position of the
# learner's final record (train loss, validation loss, error rate).
learning_results = [(metric_name, learn.final_record[position])
                    for position, metric_name in enumerate(('final_train_loss',
                                                            'final_val_loss',
                                                            'final_error_rate'))]

In [None]:
learning_results

# Inspect results

In [None]:
learn.show_results()

In [None]:
interp = Interpretation.from_learner(learn)

In [None]:
interp.plot_top_losses(16, figsize=(15,10))


In [None]:
class_interp = ClassificationInterpretation.from_learner(learn)
class_interp.plot_confusion_matrix(title='Confusion matrix', figsize=(10,10))

In [None]:
class_interp.most_confused(min_val=5)

# Make prediction on test set

In [None]:
test_folder = get_image_files(data_dir/"test")

In [None]:
# Name the exported learner after the experiment configuration.
model_path = f'models/{config}.pkl'
learn.export(fname=model_path)

In [None]:
# Classify the test images one at a time, recording each image path and
# the corresponding prediction.
predictions = {'image': [], 'sports': []}
for image in tqdm(test_folder):
  pred = learn.predict(image)
  # Shorten paths that contain the repo-local data directory.
  predictions['image'].append(str(image).replace('sports_classification/data','.'))
  # Only the first element of the prediction is stored as the label.
  predictions['sports'].append(pred[0])

In [None]:
pred_df = pd.DataFrame(predictions)

In [None]:
pred_df.image = pred_df.image.apply(lambda x : x.replace('/content/data', '.'))

In [None]:
test_prediction_file = 'test_evaluation.csv'
pred_df.to_csv(test_prediction_file, index=False)

# Perform TTA on model

TTA is only applied to the validation part of a set of dataloaders, and we can't create dataloaders that contain only a validation part. We therefore create two test dataloaders, each using one half of the test dataset as its validation part, run `tta()` over both, and then concatenate the results.

In [None]:
def get_dummy_label(x):
  """Return a constant placeholder label, ignoring ``x``.

  Serves as a ``get_y`` function where a label is required but unknown.
  """
  placeholder = 'football'
  return placeholder

In [None]:
num_test_images = len(get_image_files(data_dir/'test'))

In [None]:
# Two otherwise identical pipelines; each one declares a different half of
# the test images as its validation split.
_first_half_end = int(num_test_images/2)
test_dblocks = [
    DataBlock(blocks=(ImageBlock, CategoryBlock),
              get_items=get_image_files,
              get_y=get_dummy_label,
              item_tfms=Resize(2*config.img_size),
              batch_tfms=aug_transforms(size=config.img_size,
                                        min_scale=0.75),
              splitter=IndexSplitter(list(valid_indices)))
    for valid_indices in (range(0, _first_half_end),
                          range(_first_half_end, num_test_images))
]

In [None]:
# Run TTA over each half of the test set. The learner is temporarily pointed
# at the test DataLoaders because tta() is called without an explicit loader.
results = []
original_dls = learn.dls
try:
  for dblock in test_dblocks:
    test_dls = dblock.dataloaders(source = data_dir/"test")
    learn.dls = test_dls
    results.append(learn.tta())
finally:
  # Restore the original DataLoaders so that later use of the learner
  # (export, show_results, further training) does not silently run on the
  # dummy-labelled test data.
  learn.dls = original_dls

# The first element of each tta() result holds the per-class scores; take the
# arg-max per image and concatenate the halves in their original order.
test_results = torch.cat((torch.argmax(results[0][0], axis=1),
                          torch.argmax(results[1][0], axis=1)), 0).numpy().tolist()
# Map class indices back to names using the vocabulary of the training data.
test_predictions = [dls.vocab[x] for x in test_results]

In [None]:
# Pair each test file with its TTA prediction.
# NOTE(review): assumes this listing returns the files in the same order as the
# get_image_files call made inside the test DataBlocks — TODO confirm.
tta_predictions = {'image': get_image_files(data_dir/"test"), 'sports': test_predictions}
tta_pred_df = pd.DataFrame(tta_predictions)
# Store the paths as strings with '/content/data' replaced by '.'.
tta_pred_df.image = tta_pred_df.image.apply(lambda x : str(x).replace('/content/data', '.'))

In [None]:
tta_test_prediction_file = 'tta_test_evaluation.csv'
tta_pred_df.to_csv(tta_test_prediction_file, index=False)

# Log the results

In [None]:
import mlflow
from mlflow import log_metric, log_param, log_artifacts,log_artifact

In [None]:
mlflow.set_tracking_uri(str(mlflow_dir))

In [None]:
def get_max_run_id(experiment_id):
    """Return the number of runs found for ``experiment_id``.

    Despite the name, the value is the length of the result of
    ``mlflow.search_runs`` for that experiment, not a maximum over run ids.
    """
    return len(mlflow.search_runs(experiment_ids=[experiment_id]))

def connect_to_experiment(experiment_name):
    """Look up an MLflow experiment by name, creating it when it is missing.

    Returns:
        An ``(experiment_id, run_id)`` tuple. ``run_id`` is 1 for a newly
        created experiment; for an existing one it is the value reported by
        ``get_max_run_id``.
    """
    existing = mlflow.get_experiment_by_name(experiment_name)
    if existing is not None:
        found_id = existing.experiment_id
        return found_id, get_max_run_id(found_id)
    # First use of this name: register the experiment.
    return mlflow.create_experiment(experiment_name), 1

In [None]:
experiment_id, run_id = connect_to_experiment('sports_classification')
mlflow.start_run(run_name='sports_classification_run', experiment_id=experiment_id)

In [None]:
for config_tuple in config.mlflow_config():
  log_param(*config_tuple)

In [None]:
for result_tuple in learning_results:
  log_metric(*result_tuple)

In [None]:
# Attach the exported model and the per-image test predictions to the run.
# NOTE(review): the TTA predictions file (tta_test_prediction_file) is not
# logged here — confirm whether that is intended.
log_artifact(model_path)
log_artifact(test_prediction_file)

In [None]:
mlflow.end_run()

# Uploading the updated mlruns back to Google Drive

In [None]:
!zip -r mlruns.zip mlruns

In [None]:
!rsync --info=progress2 mlruns.zip ./drive/My\ Drive/SportsClassification/ 

# Loading existing model for further work

In [None]:
# experiment = mlflow.get_experiment_by_name('sports_classification')
# assert(experiment is not None)

In [None]:
# runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id]) 

In [None]:
# runs

In [None]:
# artifact_uri = runs.artifact_uri.iloc[0]

In [None]:
# models = [str(f) for f in Path(artifact_uri).ls() if '.pkl' in str(f)]
# assert(len(models)==1)
# model = models[0]

In [None]:
#learn = load_learner(model)

# Visualize Activations for a prediction using Grad Cam