In [1]:
import os
import json
import pandas as pd
import datetime
import re

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from vader_sentences_processing import SentimentIntensityFromFile, Sentence

The following code loads the transcription files for the available leagues other than the Premier League (the Premier League files were extracted in the main EDA notebook for the VADER model analysis). We decided to process each league separately so that we can control every stage and check the correctness of the returned dataframes and .csv files. We created two classes for extracting and storing the information from the files: `SentimentIntensityFromFile` and `Sentence` (see the file *vader_sentences_processing.py*).

# Champions League

In [2]:
# Load every Champions League transcription file and compute its sentence sentiments.
# The root folder is assembled with os.path.join instead of a hard-coded
# backslash path, so the walk also finds the files on systems where '\' is not
# the path separator (on Windows the resulting path string is unchanged).
champ_league_root = os.path.join('transcriptions', 'europe_uefa-champions-league')

file_number = 0   # number of files processed so far
files_data2 = []  # one SentimentIntensityFromFile object per processed file

# Folders without files need no special case: their inner loop simply does not run.
for directory, _subdirs, filenames in os.walk(champ_league_root):
    # Skip every folder whose path contains the substring 'other'.
    # NOTE(review): this is a substring test on the whole path, so a match folder
    # whose name merely contains 'other' would be skipped too - confirm intended.
    if 'other' in directory:
        continue

    for filename in filenames:
        # .csv files (e.g. the exported results) are not transcriptions.
        if filename.endswith('.csv'):
            continue
        # Progress indicator: print the path of every 25th file.
        if file_number % 25 == 0:
            print(os.path.join(directory, filename))

        file_number += 1
        curr_file = SentimentIntensityFromFile(filename=filename, path=directory)
        curr_file.load_json()
        curr_file.get_sentiments_from_sentences()
        files_data2.append(curr_file)

transcriptions\europe_uefa-champions-league\2014-2015\2014-11-04 - 20-00 Zenit Petersburg 1 - 2 Bayer Leverkusen\1_224p_medium_asr.json
transcriptions\europe_uefa-champions-league\2014-2015\2014-12-10 - 22-45 Barcelona 3 - 1 Paris SG\2_224p_medium_asr.json
transcriptions\europe_uefa-champions-league\2014-2015\2015-03-18 - 22-45 Dortmund 0 - 3 Juventus\1_224p_medium_asr.json
transcriptions\europe_uefa-champions-league\2015-2016\2015-09-15 - 21-45 Galatasaray 0 - 2 Atl. Madrid\2_224p_medium_asr.json
transcriptions\europe_uefa-champions-league\2015-2016\2015-09-29 - 21-45 Barcelona 2 - 1 Bayer Leverkusen\1_224p_medium_asr.json
transcriptions\europe_uefa-champions-league\2015-2016\2015-11-03 - 22-45 Shakhtar Donetsk 4 - 0 Malmo FF\2_224p_medium_asr.json
transcriptions\europe_uefa-champions-league\2015-2016\2015-11-25 - 22-45 Atl. Madrid 2 - 0 Galatasaray\1_224p_medium_asr.json
transcriptions\europe_uefa-champions-league\2016-2017\2016-10-19 - 21-45 Barcelona 4 - 0 Manchester City\2_224p_me

In [3]:
# Flatten the processed Champions League files into one record per sentence.
# A single list of row records is used so that no accumulator lists are shared
# with the cells of the other leagues.
champ_league_rows = [
    {
        'MATCH_FOLDER': match_file.folder_name,
        'START_TIME': sentence.start_time,
        'END_TIME': sentence.end_time,
        'TEXT': sentence.text,
        'POSITIVE': sentence.positive,
        'NEGATIVE': sentence.negative,
        'NEUTRAL': sentence.neutral,
        'COMPOUND': sentence.compound,
    }
    for match_file in files_data2
    for sentence in match_file.sentences
]

#preparing dataframe with information from the files
# `columns` fixes the column order and keeps all eight columns even when no
# sentence was collected.
df_champ_league = pd.DataFrame(
    champ_league_rows,
    columns=['MATCH_FOLDER', 'START_TIME', 'END_TIME', 'TEXT',
             'POSITIVE', 'NEGATIVE', 'NEUTRAL', 'COMPOUND'],
)


In [4]:
# Number of Champions League files processed by the loading loop.
# `file_number` is reset to 0 by every league's loading cell, so this value
# belongs to whichever loading cell was executed last.
file_number

216

In [6]:
#exporting the results to the .csv file
# The target path is assembled with os.path.join so that it points into the
# league folder on any operating system (on Windows the string is unchanged).
champ_league_csv = os.path.join('transcriptions', 'europe_uefa-champions-league',
                                'europe_uefa-champions-league_sentences_vader.csv')
df_champ_league.to_csv(champ_league_csv, sep=';')

# Ligue 1

In [7]:
# Load every Ligue 1 transcription file and compute its sentence sentiments.
# The root folder is assembled with os.path.join instead of a hard-coded
# backslash path, so the walk also finds the files on systems where '\' is not
# the path separator (on Windows the resulting path string is unchanged).
ligue1_root = os.path.join('transcriptions', 'france_ligue-1')

file_number = 0   # number of files processed so far
files_data3 = []  # one SentimentIntensityFromFile object per processed file

# Folders without files need no special case: their inner loop simply does not run.
for directory, _subdirs, filenames in os.walk(ligue1_root):
    # Skip every folder whose path contains the substring 'other'.
    # NOTE(review): this is a substring test on the whole path, so a match folder
    # whose name merely contains 'other' would be skipped too - confirm intended.
    if 'other' in directory:
        continue

    for filename in filenames:
        # .csv files (e.g. the exported results) are not transcriptions.
        if filename.endswith('.csv'):
            continue
        # Progress indicator: print the path of every 25th file.
        if file_number % 25 == 0:
            print(os.path.join(directory, filename))

        file_number += 1
        curr_file = SentimentIntensityFromFile(filename=filename, path=directory)
        curr_file.load_json()
        curr_file.get_sentiments_from_sentences()
        files_data3.append(curr_file)

transcriptions\france_ligue-1\2014-2015\2015-04-05 - 22-00 Marseille 2 - 3 Paris SG\1_224p_medium_asr.json
transcriptions\france_ligue-1\2016-2017\2016-10-01 - 18-00 Paris SG 2 - 0 Bordeaux\2_224p_medium_asr.json
transcriptions\france_ligue-1\2016-2017\2016-12-17 - 19-00 Guingamp 2 - 1 Paris SG\1_224p_medium_asr.json
transcriptions\france_ligue-1\2016-2017\2017-04-02 - 18-00 Nice 2 - 1 Bordeaux\2_224p_medium_asr.json


In [8]:
# Number of files processed for Ligue 1.
# `file_number` is reset to 0 by every league's loading cell, so this value
# belongs to whichever loading cell was executed last.
file_number

94

In [9]:
# Flatten the processed Ligue 1 files into one record per sentence.
# A single list of row records is used so that no accumulator lists are shared
# with the cells of the other leagues.
ligue1_rows = [
    {
        'MATCH_FOLDER': match_file.folder_name,
        'START_TIME': sentence.start_time,
        'END_TIME': sentence.end_time,
        'TEXT': sentence.text,
        'POSITIVE': sentence.positive,
        'NEGATIVE': sentence.negative,
        'NEUTRAL': sentence.neutral,
        'COMPOUND': sentence.compound,
    }
    for match_file in files_data3
    for sentence in match_file.sentences
]

# Dataframe with the information from the files; `columns` fixes the column
# order and keeps all eight columns even when no sentence was collected.
df_league1 = pd.DataFrame(
    ligue1_rows,
    columns=['MATCH_FOLDER', 'START_TIME', 'END_TIME', 'TEXT',
             'POSITIVE', 'NEGATIVE', 'NEUTRAL', 'COMPOUND'],
)


In [12]:
#export dataframe to .csv file for future analysis
# The target path is assembled with os.path.join so that it points into the
# league folder on any operating system (on Windows the string is unchanged).
ligue1_csv = os.path.join('transcriptions', 'france_ligue-1',
                          'france_ligue-1_sentences_vader.csv')
df_league1.to_csv(ligue1_csv, sep=';')

# Bundesliga

In [13]:
# Load every Bundesliga transcription file and compute its sentence sentiments.
# The root folder is assembled with os.path.join instead of a hard-coded
# backslash path, so the walk also finds the files on systems where '\' is not
# the path separator (on Windows the resulting path string is unchanged).
bundesliga_root = os.path.join('transcriptions', 'germany_bundesliga')

file_number = 0   # number of files processed so far
files_data4 = []  # one SentimentIntensityFromFile object per processed file

# Folders without files need no special case: their inner loop simply does not run.
for directory, _subdirs, filenames in os.walk(bundesliga_root):
    # Skip every folder whose path contains the substring 'other'.
    # NOTE(review): this is a substring test on the whole path, so a match folder
    # whose name merely contains 'other' would be skipped too - confirm intended.
    if 'other' in directory:
        continue

    for filename in filenames:
        # .csv files (e.g. the exported results) are not transcriptions.
        if filename.endswith('.csv'):
            continue
        # Progress indicator: print the path of every 25th file.
        if file_number % 25 == 0:
            print(os.path.join(directory, filename))

        file_number += 1
        curr_file = SentimentIntensityFromFile(filename=filename, path=directory)
        curr_file.load_json()
        curr_file.get_sentiments_from_sentences()
        files_data4.append(curr_file)

transcriptions\germany_bundesliga\2014-2015\2015-02-21 - 17-30 Paderborn 0 - 6 Bayern Munich\1_224p_medium_asr.json
transcriptions\germany_bundesliga\2015-2016\2015-09-19 - 16-30 Darmstadt 0 - 3 Bayern Munich\2_224p_medium_asr.json
transcriptions\germany_bundesliga\2015-2016\2016-04-23 - 16-30 Hertha Berlin 0 - 2 Bayern Munich\1_224p_medium_asr.json
transcriptions\germany_bundesliga\2016-2017\2016-11-19 - 20-30 Dortmund 1 - 0 Bayern Munich\2_224p_medium_asr.json
transcriptions\germany_bundesliga\2016-2017\2017-02-11 - 17-30 Darmstadt 2 - 1 Dortmund\1_224p_medium_asr.json


In [16]:
# Flatten the processed Bundesliga files into one record per sentence.
# A single list of row records is used so that no accumulator lists are shared
# with the cells of the other leagues.
bundesliga_rows = [
    {
        'MATCH_FOLDER': match_file.folder_name,
        'START_TIME': sentence.start_time,
        'END_TIME': sentence.end_time,
        'TEXT': sentence.text,
        'POSITIVE': sentence.positive,
        'NEGATIVE': sentence.negative,
        'NEUTRAL': sentence.neutral,
        'COMPOUND': sentence.compound,
    }
    for match_file in files_data4
    for sentence in match_file.sentences
]

# Dataframe with the information from the files; `columns` fixes the column
# order and keeps all eight columns even when no sentence was collected.
df_bundesliga = pd.DataFrame(
    bundesliga_rows,
    columns=['MATCH_FOLDER', 'START_TIME', 'END_TIME', 'TEXT',
             'POSITIVE', 'NEGATIVE', 'NEUTRAL', 'COMPOUND'],
)


In [17]:
# Comparing the shapes (rows, columns) of the Ligue 1 and Bundesliga dataframes -
# each observation (row) contains one segment extracted by Whisper.
df_league1.shape, df_bundesliga.shape

((77314, 8), (38448, 8))

In [18]:
#export to csv
# The target path is assembled with os.path.join so that it points into the
# league folder on any operating system (on Windows the string is unchanged).
bundesliga_csv = os.path.join('transcriptions', 'germany_bundesliga',
                              'germany_bundesliga_sentences_vader.csv')
df_bundesliga.to_csv(bundesliga_csv, sep=';')

# Serie A

In [19]:
# Load every Serie A transcription file and compute its sentence sentiments.
# The root folder is assembled with os.path.join instead of a hard-coded
# backslash path, so the walk also finds the files on systems where '\' is not
# the path separator (on Windows the resulting path string is unchanged).
serie_a_root = os.path.join('transcriptions', 'italy_serie-a')

file_number = 0   # number of files processed so far
files_data5 = []  # one SentimentIntensityFromFile object per processed file

# Folders without files need no special case: their inner loop simply does not run.
for directory, _subdirs, filenames in os.walk(serie_a_root):
    # Skip every folder whose path contains the substring 'other'.
    # NOTE(review): this is a substring test on the whole path, so a match folder
    # whose name merely contains 'other' would be skipped too - confirm intended.
    if 'other' in directory:
        continue

    for filename in filenames:
        # .csv files (e.g. the exported results) are not transcriptions.
        if filename.endswith('.csv'):
            continue
        # Progress indicator: print the path of every 25th file.
        if file_number % 25 == 0:
            print(os.path.join(directory, filename))

        file_number += 1
        curr_file = SentimentIntensityFromFile(filename=filename, path=directory)
        curr_file.load_json()
        curr_file.get_sentiments_from_sentences()
        files_data5.append(curr_file)

transcriptions\italy_serie-a\2014-2015\2015-02-15 - 14-30 AC Milan 1 - 1 Empoli\1_224p_medium_asr.json
transcriptions\italy_serie-a\2015-2016\2015-09-20 - 13-30 Chievo 0 - 1 Inter\2_224p_medium_asr.json
transcriptions\italy_serie-a\2016-2017\2016-08-28 - 21-45 Cagliari 2 - 2 AS Roma\1_224p_medium_asr.json
transcriptions\italy_serie-a\2016-2017\2016-09-24 - 21-45 Napoli 2 - 0 Chievo\2_224p_medium_asr.json
transcriptions\italy_serie-a\2016-2017\2016-10-30 - 17-00 Empoli 0 - 0 AS Roma\1_224p_medium_asr.json
transcriptions\italy_serie-a\2016-2017\2016-12-12 - 23-00 AS Roma 1 - 0 AC Milan\2_224p_medium_asr.json
transcriptions\italy_serie-a\2016-2017\2017-02-12 - 14-30 Crotone 0 - 2 AS Roma\1_224p_medium_asr.json
transcriptions\italy_serie-a\2016-2017\2017-04-02 - 21-45 Napoli 1 - 1 Juventus\2_224p_medium_asr.json
transcriptions\italy_serie-a\2016-2017\2017-05-20 - 19-00 Chievo 3 - 5 AS Roma\1_224p_medium_asr.json


In [20]:
# Number of files processed for Serie A.
# `file_number` is reset to 0 by every league's loading cell, so this value
# belongs to whichever loading cell was executed last.
file_number

208

In [21]:
# Flatten the processed Serie A files into one record per sentence.
# A single list of row records is used so that no accumulator lists are shared
# with the cells of the other leagues.
serie_a_rows = [
    {
        'MATCH_FOLDER': match_file.folder_name,
        'START_TIME': sentence.start_time,
        'END_TIME': sentence.end_time,
        'TEXT': sentence.text,
        'POSITIVE': sentence.positive,
        'NEGATIVE': sentence.negative,
        'NEUTRAL': sentence.neutral,
        'COMPOUND': sentence.compound,
    }
    for match_file in files_data5
    for sentence in match_file.sentences
]

# Dataframe with the information from the files; `columns` fixes the column
# order and keeps all eight columns even when no sentence was collected.
df_serieA = pd.DataFrame(
    serie_a_rows,
    columns=['MATCH_FOLDER', 'START_TIME', 'END_TIME', 'TEXT',
             'POSITIVE', 'NEGATIVE', 'NEUTRAL', 'COMPOUND'],
)


In [22]:
#export to csv
# The target path is assembled with os.path.join so that it points into the
# league folder on any operating system (on Windows the string is unchanged).
serie_a_csv = os.path.join('transcriptions', 'italy_serie-a',
                           'italy_serie-a_sentences_vader.csv')
df_serieA.to_csv(serie_a_csv, sep=';')

# La Liga

In [24]:
# Load every La Liga transcription file and compute its sentence sentiments.
# The root folder is assembled with os.path.join instead of a hard-coded
# backslash path, so the walk also finds the files on systems where '\' is not
# the path separator (on Windows the resulting path string is unchanged).
laliga_root = os.path.join('transcriptions', 'spain_laliga')

file_number = 0   # number of files processed so far
files_data6 = []  # one SentimentIntensityFromFile object per processed file

# Folders without files need no special case: their inner loop simply does not run.
for directory, _subdirs, filenames in os.walk(laliga_root):
    # Skip every folder whose path contains the substring 'other'.
    # NOTE(review): this is a substring test on the whole path, so a match folder
    # whose name merely contains 'other' would be skipped too - confirm intended.
    if 'other' in directory:
        continue

    for filename in filenames:
        # .csv files (e.g. the exported results) are not transcriptions.
        if filename.endswith('.csv'):
            continue
        # Progress indicator: print the path of every 25th file.
        if file_number % 25 == 0:
            print(os.path.join(directory, filename))

        file_number += 1
        curr_file = SentimentIntensityFromFile(filename=filename, path=directory)
        curr_file.load_json()
        curr_file.get_sentiments_from_sentences()
        files_data6.append(curr_file)

transcriptions\spain_laliga\2014-2015\2015-02-14 - 20-00 Real Madrid 2 - 0 Dep. La Coruna\1_224p_medium_asr.json
transcriptions\spain_laliga\2014-2015\2015-05-09 - 19-00 Barcelona 2 - 0 Real Sociedad\2_224p_medium_asr.json
transcriptions\spain_laliga\2015-2016\2015-09-26 - 19-15 Real Madrid 0 - 0 Malaga\1_224p_medium_asr.json
transcriptions\spain_laliga\2015-2016\2016-01-24 - 22-30 Betis 1 - 1 Real Madrid\2_224p_medium_asr.json
transcriptions\spain_laliga\2015-2016\2016-04-20 - 23-00 Real Madrid 3 - 0 Villarreal\1_224p_medium_asr.json
transcriptions\spain_laliga\2016-2017\2016-09-21 - 23-00 Barcelona 1 - 1 Atl. Madrid\2_224p_medium_asr.json
transcriptions\spain_laliga\2016-2017\2016-11-27 - 22-45 Real Sociedad 1 - 1 Barcelona\1_224p_medium_asr.json
transcriptions\spain_laliga\2016-2017\2017-02-18 - 18-15 Real Madrid 2 - 0 Espanyol\2_224p_medium_asr.json
transcriptions\spain_laliga\2016-2017\2017-04-02 - 21-45 Granada CF 1 - 4 Barcelona\1_224p_medium_asr.json
transcriptions\spain_laliga

In [25]:
# Number of files processed for La Liga.
# `file_number` is reset to 0 by every league's loading cell, so this value
# belongs to whichever loading cell was executed last.
file_number

234

In [26]:
# Flatten the processed La Liga files into one record per sentence.
# A single list of row records is used so that no accumulator lists are shared
# with the cells of the other leagues.
laliga_rows = [
    {
        'MATCH_FOLDER': match_file.folder_name,
        'START_TIME': sentence.start_time,
        'END_TIME': sentence.end_time,
        'TEXT': sentence.text,
        'POSITIVE': sentence.positive,
        'NEGATIVE': sentence.negative,
        'NEUTRAL': sentence.neutral,
        'COMPOUND': sentence.compound,
    }
    for match_file in files_data6
    for sentence in match_file.sentences
]

# Dataframe with the information from the files; `columns` fixes the column
# order and keeps all eight columns even when no sentence was collected.
df_laliga = pd.DataFrame(
    laliga_rows,
    columns=['MATCH_FOLDER', 'START_TIME', 'END_TIME', 'TEXT',
             'POSITIVE', 'NEGATIVE', 'NEUTRAL', 'COMPOUND'],
)


In [27]:
#export to csv
# The target path is assembled with os.path.join so that it points into the
# league folder on any operating system (on Windows the string is unchanged).
laliga_csv = os.path.join('transcriptions', 'spain_laliga',
                          'spain_laliga_sentences_vader.csv')
df_laliga.to_csv(laliga_csv, sep=';')