In [1]:
import pandas as pd
from nltk.translate import bleu_score
from jiwer import wer
import seaborn as sns
import numpy as np
import time, datetime

In [2]:
videos = pd.read_csv("data/video_list.csv")

In [3]:
# replace weird year with nan
videos['year'].loc[(videos.year > 3000)] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  videos['year'].loc[(videos.year > 3000)] = np.nan


In [4]:
# add transcripts into the dataframe

transcripts = pd.DataFrame(columns = ['id', 'computer_transcript', 'human_transcript'])  

# will add and join human transcripts here too when we have them 

for id in videos.id:
        
    try:
        #open text file in read mode
        human_transcript_file = open("human_transcripts/%s.txt" % id, "r")
        
        #read whole file to a string
        human_transcript = human_transcript_file.read()
        
        #close file
        human_transcript_file.close()
        
        #open text file in read mode
        computer_transcript_file = open("computer_transcripts/%s.txt" % id, "r")
        
        #read whole file to a string
        computer_transcript = computer_transcript_file.read()
        
        #close file
        computer_transcript_file.close()
        
        #add row to transcripts dataframe
        transcripts.loc[len(transcripts.index)] = [id, computer_transcript, human_transcript]
        
    except IOError:
        print("%s.txt not accessible" % id )
        
        
videos = videos.merge(transcripts, on='id', how='left')
    

tobacco_qar62a00.txt not accessible
tobacco_ldo23e00.txt not accessible
tobacco_gxu03f00.txt not accessible
tobacco_gav28d00.txt not accessible
tobacco_qyq95i00.txt not accessible
tobacco_lxkv0152.txt not accessible
tobacco_byv27a00.txt not accessible
tobacco_kpp06a00.txt not accessible
tobacco_mnjp0149.txt not accessible
tobacco_mpp06a00.txt not accessible
tobacco_dlm09c00.txt not accessible


In [5]:
def clean_transcript(df, transcript_col):
    
    # if a comma is followed by anything other than a space, replace it with comma space
    df[transcript_col] = df[transcript_col].str.replace("(?<=[.,])(?=[^\s])"," ")
    
    # remove commas, semicolons, dashes
    df[transcript_col] = df[transcript_col].str.replace(";","")
    df[transcript_col] = df[transcript_col].str.replace(":","")
    df[transcript_col] = df[transcript_col].str.replace(",","")
    df[transcript_col] = df[transcript_col].str.replace("-","")
    
    # Remove sentence enders
    df[transcript_col] = df[transcript_col].str.replace("!","")
    df[transcript_col] = df[transcript_col].str.replace("?","")
    df[transcript_col] = df[transcript_col].str.replace("\.{2,}","")
    
    # convert all to lowercase
    df[transcript_col] = df[transcript_col].str.lower()
    
    # turn the transcript strings into lists of words 
    # split by spaces
    #df[transcript_col] = df[transcript_col].str.split(' ')

In [6]:
clean_transcript(videos, "computer_transcript")

  df[transcript_col] = df[transcript_col].str.replace("(?<=[.,])(?=[^\s])"," ")
  df[transcript_col] = df[transcript_col].str.replace("?","")
  df[transcript_col] = df[transcript_col].str.replace("\.{2,}","")


In [7]:
clean_transcript(videos, "human_transcript")

  df[transcript_col] = df[transcript_col].str.replace("(?<=[.,])(?=[^\s])"," ")
  df[transcript_col] = df[transcript_col].str.replace("?","")
  df[transcript_col] = df[transcript_col].str.replace("\.{2,}","")


In [8]:
# add sentiment scores
sentiment_scores = pd.read_csv("data/computer_human_sentiment_scores.csv")
# remove .txt from filename and rename id 
sentiment_scores.file_name = sentiment_scores['file_name'].str.replace(".txt","")
sentiment_scores = sentiment_scores.rename(columns = dict(file_name = 'id'))
sentiment_scores = sentiment_scores.drop(columns = ['file_name.1'])

# join to videos
videos = videos.merge(sentiment_scores, on='id', how='left')

  sentiment_scores.file_name = sentiment_scores['file_name'].str.replace(".txt","")


In [9]:
# calculate Word Accuracy Rate
videos = videos.dropna(axis = 0, subset = 'human_transcript')

def calculate_war(row):
    return 1 - (wer(row['human_transcript'], row['computer_transcript']))

videos['war'] = videos.apply(lambda row: calculate_war(row), axis =1 )

In [10]:
# calculate BLEU score
chencherry = bleu_score.SmoothingFunction()

def calculate_bleu(row):
    return bleu_score.sentence_bleu(row['human_transcript'].split(), row['computer_transcript'].split(), smoothing_function=chencherry.method1)

videos['bleu_score'] = videos.apply(lambda row: calculate_bleu(row), axis =1 )

In [11]:
# convert runtime column to seconds for easier correlations

videos.runtime = pd.to_timedelta(videos['runtime']).dt.total_seconds()


In [12]:
videos.corr()

Unnamed: 0,runtime,year,automl_confidence_avg,automl_confidence_min,automl_confidence_max,sentiment,magnitude,human_sentiment,human_magnitude,war,bleu_score
runtime,1.0,0.547801,0.145547,-0.563742,0.346913,-0.387981,0.887973,-0.532274,-0.2643,0.495872,-0.320941
year,0.547801,1.0,0.483801,-0.153729,0.36043,-0.499006,0.589208,-0.610186,-0.480975,0.71093,-0.292201
automl_confidence_avg,0.145547,0.483801,1.0,0.35132,0.387133,-0.422837,0.129264,-0.283685,-0.487142,0.748043,-0.284909
automl_confidence_min,-0.563742,-0.153729,0.35132,1.0,-0.451405,0.121012,-0.36982,0.261859,-0.430108,-0.080677,0.47424
automl_confidence_max,0.346913,0.36043,0.387133,-0.451405,1.0,-0.562342,0.284736,-0.470399,0.201579,0.525152,-0.936515
sentiment,-0.387981,-0.499006,-0.422837,0.121012,-0.562342,1.0,-0.368069,0.799116,0.20375,-0.462527,0.420624
magnitude,0.887973,0.589208,0.129264,-0.36982,0.284736,-0.368069,1.0,-0.558816,-0.26594,0.49081,-0.271134
human_sentiment,-0.532274,-0.610186,-0.283685,0.261859,-0.470399,0.799116,-0.558816,1.0,0.210845,-0.533919,0.427259
human_magnitude,-0.2643,-0.480975,-0.487142,-0.430108,0.201579,0.20375,-0.26594,0.210845,1.0,-0.502425,-0.237209
war,0.495872,0.71093,0.748043,-0.080677,0.525152,-0.462527,0.49081,-0.533919,-0.502425,1.0,-0.500573


In [13]:
videos

Unnamed: 0,id,runtime,category,url,year,fellow_accuracy_rating,automl_confidence_avg,automl_confidence_min,automl_confidence_max,computer_transcript,human_transcript,sentiment,magnitude,human_sentiment,human_magnitude,war,bleu_score
0,tobacco_rdz99d00,89.0,Advertising,https://archive.org/download/tobacco_rdz99d00/...,1966.0,Poor,0.765765,0.758432,0.773098,then is the newport a welcome place never hush...,﻿smooth and fresh is the newport taste. welcom...,0.8,0.8,0.8,5.7,0.384615,0.007266
1,tobacco_amp91f00,325.0,Advertising,https://archive.org/download/tobacco_amp91f00/...,1994.0,,0.844541,0.759544,0.912708,when i think about it most of my friends who s...,﻿when i think about it most of my friends who ...,-0.8,2.4,-0.3,17.4,0.882171,0.000449
2,tobacco_xpu03f00,688.0,Advertising,https://archive.org/download/tobacco_xpu03f00/...,1961.0,Good,0.813684,0.548756,0.910613,hardly folks if you have never smoked raleigh ...,﻿pardon me folks if you have never smoked rale...,0.3,2.7,0.3,70.0,0.711312,0.000203
5,tobacco_lez99d00,96.0,Advertising,https://archive.org/download/tobacco_lez99d00/...,1968.0,Excellent,0.866173,0.845815,0.88653,he's an independent guy he likes to set his ow...,﻿he's an independent guy he likes to set his o...,-0.4,0.4,0.2,2.3,0.755906,0.001986
6,tobacco_tpu03f00,642.0,Advertising,https://archive.org/download/tobacco_tpu03f00/...,1966.0,Fair,0.793365,0.579098,0.904231,tackles aboard 38 men all around that get unde...,﻿shackles aboard 38 men all around to get unde...,0.3,1.7,0.5,40.5,0.634725,0.000269
7,tobacco_nou03f00,398.0,Advertising,https://archive.org/download/tobacco_nou03f00/...,1961.0,Poor,0.763748,0.65261,0.904791,i hope twice's refresh don't fight and refresh...,smoke belair smoke twice as refresh smoke twic...,0.2,0.8,0.2,31.799999,0.351767,0.00042
8,tobacco_szy99d00,59.0,Advertising,https://archive.org/download/tobacco_szy99d00/...,1968.0,Good,0.820326,0.820326,0.820326,sarah patron whose name was mcnair as the barb...,﻿so the patron whose name was mcnair as the ba...,0.9,0.9,0.2,1.1,0.753846,0.003148
16,tobacco_kou03f00,443.0,Advertising,https://archive.org/download/tobacco_kou03f00/...,1957.0,Good,0.801213,0.707425,0.87761,say here's something important for you filter ...,say here's something important for you filter ...,0.7,3.8,0.6,16.299999,0.817708,0.000242
17,tobacco_kpr91e00,159.0,Advertising,https://archive.org/download/tobacco_kpr91e00/...,1968.0,Excellent,0.891701,0.874894,0.912838,this is living some people don't think so so t...,this is living some people don't think so so t...,0.0,0.0,0.1,1.4,0.951724,0.001453
18,tobacco_ohq03d00,1137.0,Advertising,https://archive.org/download/tobacco_ohq03d00/...,1988.0,Good,0.867385,0.632409,0.912839,breakthroughs and cigarette technology beginni...,breakthroughs and cigarette technology beginni...,0.4,4.7,0.4,17.299999,0.96013,0.000249


In [14]:
videos.to_csv('data/final_dataset.csv')