# General Data Cleaning Activities


## Cleaning CSI ranking data from IMDb

In [1]:
import os

import pandas as pd

In [4]:
# Raw IMDb export read without a header row: column 0 holds the episode title,
# column 1 the full listing text for that episode.
# A forward slash keeps the path valid on every OS and avoids the invalid "\C"
# escape sequence that the backslash form produced.
ranking_file = "metadata/CSI_imdb_ranking_synopsis.csv"
rankings = pd.read_csv(ranking_file, header=None)
print(rankings.head())

# Keep the first listing cell around as a sample input for the parser below.
test_cell = rankings[1][0]
print(test_cell)

                    0                                                  1
0         For Warrick  1. CSI: Crime Scene Investigation (2000–2015) ...
1           One to Go  2. CSI: Crime Scene Investigation (2000–2015) ...
2         Living Doll  3. CSI: Crime Scene Investigation (2000–2015) ...
3  Girls Gone Gadgets  4. CSI: Crime Scene Investigation (2000–2015) ...
4           Gum Drops  5. CSI: Crime Scene Investigation (2000–2015) ...
1. CSI: Crime Scene Investigation (2000–2015) Episode: For Warrick (2008) TV-14 | 45 min | Crime, Drama, Mystery 9.0 Rate this CSIs past and present come together at the news of Warrick's death. But will Warrick's real killer successfully frame another police officer for his murder? Director: Richard J. Lewis | Stars: William Petersen , Marg Helgenberger , Gary Dourdan , George Eads Votes: 1,083


In [10]:
def cell_splitter(test_cell):
    """Extract rank, episode name and rating from one IMDb listing cell.

    The cell is expected to follow this layout::

        <rank>. CSI: Crime Scene Investigation (2000–2015) Episode: <name> (<year>)
        TV-14 | <runtime> | Crime, Drama, Mystery <rating> Rate this <synopsis> ...

    The certificate may be TV-14, TV-PG or TV-MA. The synopsis, director,
    stars and vote count that follow the rating are not extracted.

    Args:
        test_cell (str): raw listing text for a single episode.

    Returns:
        list[str]: ``[rank, name, rating]``, each kept as a string.

    Raises:
        IndexError: if the series title, ``'Episode: '``, certificate or
            genre marker is missing from the cell.
    """
    # Normalise only the certificate token itself. Replacing every 'PG'/'MA'
    # in the whole cell would also rewrite episode names containing those letters.
    for certificate in ('TV-PG', 'TV-MA'):
        test_cell = test_cell.replace(' ' + certificate + ' | ', ' TV-14 | ')

    rank, *r = test_cell.split('. CSI: Crime Scene Investigation (2000–2015) ')
    # Whatever precedes 'Episode: ' is not needed.
    _, *r = r[0].split('Episode: ')
    name, *r = r[0].split(' TV-14 | ')
    name = name[:-7]  # drop the trailing ' (YYYY)' year suffix
    r = r[0].split('Crime, Drama, Mystery ')[1:]
    rating, *r = r[0].split(' Rate this ')
    return [rank, name, rating]

In [11]:
# Spot-check the parser on the sample cell (first row of the listing column).
cell_splitter(test_cell)

['1', 'For Warrick', '9.0']

In [12]:
# Parse every listing cell into [rank, episode name, rating] and collect the rows.
# NOTE(review): the 'epsisode' column name is misspelled and the 'rank' column
# actually holds the rating value; both are left unchanged here because anything
# that reads the exported CSV may depend on these names.
new_rankings = pd.DataFrame(
    rankings[1].apply(cell_splitter).to_list(),
    columns=['rank_order', 'epsisode', 'rank'],
)
print(new_rankings.head())

# A forward slash keeps the path valid on every OS and avoids the invalid "\C"
# escape sequence that the backslash form produced.
new_rankings.to_csv("metadata/CSI_imdb_ranking.csv")

  rank_order            epsisode rank
0          1         For Warrick  9.0
1          2           One to Go  8.9
2          3         Living Doll  8.9
3          4  Girls Gone Gadgets  8.8
4          5           Gum Drops  8.8


## Cleaning SRT files

In [38]:
import pysrt
import re

In [69]:
def sub_stripper(f_name):
    """Clean one CSI ``.srt`` subtitle file into sentence-level rows.

    For every caption the italic tags, line breaks, non-breaking spaces and
    double quotes are removed, and anything up to the first colon (a speaker
    label such as ``GRISSOM:``) is dropped. Music captions (starting with
    ``♪``) and fully parenthesised captions are skipped. Captions are then
    concatenated until one ends with ``.``, ``?``, ``!`` or ``)``, so that a
    sentence spanning several captions becomes a single row.

    Args:
        f_name (str): path of the ``*.srt`` file to process.

    Returns:
        pd.DataFrame: one row per merged sentence with ``start``, ``end`` and
        ``text`` columns. ``start`` is the start time of the first caption in
        the sentence and ``end`` the end time of the last one, both truncated
        to whole seconds.
    """
    subs = pysrt.open(f_name)
    sentence_enders = ('.', '?', '!', ')')
    new_subs = []
    continued = False  # True while a sentence is still open across captions
    start = end = None
    sub_string = ''

    for s in subs:
        text = (
            s.text.replace('<i>', ' ')
            .replace('</i>', '')
            .replace('\n', ' ')
            .replace('\xa0', ' ')
            .replace('"', '')
        )

        # Drop the speaker label. Only the FIRST colon is treated as the
        # separator so that later colons (e.g. "10:30") keep their text.
        _, colon, after_label = text.partition(':')
        if colon:
            text = after_label
        # Strip both ends: trailing whitespace would otherwise hide the
        # sentence-ending punctuation from the checks below.
        text = text.strip()

        if not text:
            continue

        # Skip music and pure sound-description captions.
        if text[0] == '♪' or (text[0] == '(' and text[-1] == ')'):
            continue

        if not continued:
            start = s.start
            sub_string = ''
        end = s.end

        if text[-1] in sentence_enders:
            sub_string = (sub_string + text).replace('  ', ' ')
            continued = False
            new_subs.append([
                start.to_time().replace(microsecond=0),
                end.to_time().replace(microsecond=0),
                sub_string,
            ])
        else:
            continued = True
            sub_string += text + ' '

    # Flush a sentence that was still open when the file ended instead of
    # silently discarding it.
    if continued:
        new_subs.append([
            start.to_time().replace(microsecond=0),
            end.to_time().replace(microsecond=0),
            sub_string.replace('  ', ' ').strip(),
        ])

    return pd.DataFrame(new_subs, columns=['start', 'end', 'text'])


In [67]:

# get path to all srt files
from glob import glob
# glob returns matches in arbitrary, OS-dependent order; sorting makes the
# sample file and the processing order the same on every run.
srt_files = sorted(glob('CSI - Crime Scene Investigation/**/*.srt', recursive=True))
test_file = srt_files[0]

In [70]:
# Look at a single merged row (index label 135) of the sample file to eyeball the cleaning result.
sub_stripper(test_file).loc[135]

start    00:09:31
end      00:09:33
text     Ma'am...
Name: 135, dtype: object

In [72]:
# process all the files
# One tab-separated file per subtitle file is written into the output folder.
# The forward slash works on every OS; the folder is created up front so the
# first to_csv call does not fail on a fresh checkout.
path_cleaned_subs = 'cleaned_subs/'
os.makedirs(path_cleaned_subs, exist_ok=True)
for sub_file in srt_files:
    # Output name: first word of the last ' - '-separated segment of the path
    # (presumably the episode code - confirm against the subtitle file naming).
    episode = sub_file.split(' - ')[-1].split(' ')[0]
    df = sub_stripper(sub_file)
    df.to_csv(os.path.join(path_cleaned_subs, episode), sep='\t')

# Joining cleaned subs and Labels from CSI