# Transcript Data Analytics

In [1]:
import os
import json
import typing

In [2]:
# Resolve the dataset locations relative to the notebook's working directory.
# The project root is taken to be two levels above the current directory.
ROOT_DIR = os.path.dirname(os.path.dirname(os.getcwd()))
DATA_DIR = os.path.join(
    ROOT_DIR, 'data', 'podcasts-transcript', 'spotify-podcasts-2020'
)
TRANSCRIPT_ROOT_DIR = os.path.join(DATA_DIR, 'podcasts-transcripts')

# Echo the resolved paths so a wrong working directory is easy to spot.
print(
    f"ROOT_DIR: {ROOT_DIR}",
    f"DATA_DIR: {DATA_DIR}",
    f"TRANSCRIPT_ROOT_DIR: {TRANSCRIPT_ROOT_DIR}",
    sep="\n",
)

ROOT_DIR: /home/erik/Projects/KTH/dd2476-podcast-search
DATA_DIR: /home/erik/Projects/KTH/dd2476-podcast-search/data/podcasts-transcript/spotify-podcasts-2020
TRANSCRIPT_ROOT_DIR: /home/erik/Projects/KTH/dd2476-podcast-search/data/podcasts-transcript/spotify-podcasts-2020/podcasts-transcripts


In [None]:
# Peek at the first entry of the dataset's file listing.
file_list_path = os.path.join(DATA_DIR, 'file_list.txt')
with open(file_list_path, 'r') as listing:
    first_line = next(listing)

print(first_line)
# Example value:
# first_line = './podcasts-transcripts/4/J/show_4jJdofaAzXkKpsFJ8wGS9I/2i074A63WWvH4Vc279IQiX.json'


Pick a sample transcript JSON file

In [None]:
# Build the absolute path of the sample transcript from the listing entry.
# Entries look like './podcasts-transcripts/...': drop the line break and the
# leading './' (only when present) so the rest can be joined onto DATA_DIR.
relative_path = first_line.rstrip('\r\n')
if relative_path.startswith('./'):
    relative_path = relative_path[2:]
SAMPLE_PATH = os.path.join(DATA_DIR, relative_path)

# Decode explicitly as UTF-8 so the result does not depend on the locale.
with open(SAMPLE_PATH, 'r', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
alternatives = data['results'][0]['alternatives']
alternatives[0].keys()

## Structure of the transcript JSON file

* The file is a dict containing a single key `results`, whose value is a list of items
* Each item in the list is a dictionary that has only one key `alternatives`
* This `alternatives` key refers to a list of **alternative** objects
* Each **alternative** is a dict whose keys include `transcript`, `confidence`, `words`
* `transcript`: a piece of text corresponding to a small portion of the episode's transcript
* `confidence`: double
* `words`: a list of word token objects, each object has attributes `startTime`, `endTime` and `word`

Structure of the transcript JSON

In [None]:
results = data['results']
len(results)

### 1. A `result` item

Each `result` item contains a single key called `alternatives` which is a list

In [None]:
result = results[0]
result.keys()

### 2. An `alternative` item

Each `alternative` item is a dictionary containing three keys
* `transcript`: an excerpt of the transcript
* `confidence`: confidence of the transcript (I guess it indicates how accurate the generated text is)
* `words`: a list of JSON objects

In [None]:
alternatives = result['alternatives']
len(alternatives)

In [None]:
alternative = alternatives[0]
alternative.keys()

In [None]:
alternative['transcript']

In [None]:
alternative['confidence']

In [None]:
len(alternative['words'])

### 3. A `word` item

Each word item is a dict containing 3 required keys: `startTime`, `endTime` and `word`. In some alternatives, a word can also contain a `speakerTag` key

In [None]:
alternative['words'][:10]

## Anomaly detection

We suspect that not all **alternatives** have the above structure, so let's check whether there is anything unusual

In [None]:
multi_al_results = [res for res in results if len(res['alternatives']) != 1]
multi_al_results

In [None]:
res_alternatives = [res['alternatives'][0] for res in results]
len(res_alternatives)

In [None]:
# Report every alternative that lacks a `transcript` key, together with the
# number of word tokens it carries.
for alt in res_alternatives:
    if 'transcript' not in alt:
        print("Alternative doesn't have `transcript` key")
        # Use .get(): an alternative without `transcript` may also lack
        # `words` (e.g. an empty dict), and indexing would raise KeyError.
        print("No. of words: ", len(alt.get('words', [])))

## Conclusion

After the above exploratory analysis, we conclude that:
* Each JSON transcript file holds a single dictionary that contains a single key `results`
* `results` is a list of dictionaries, each containing a single key `alternatives`, which in turn is a list of objects
* Each `alternatives` list contains dictionaries, each dubbed an `alternative`
* There are **two** types of `alternative`:
  - Type 1: a dictionary containing **three** keys: `transcript`, `confidence`, and `words`
  - Type 2: a dictionary containing **only one** key, `words`, that holds all the word tokens in the transcript of a podcast

In [None]:
# Collect the transcript JSON files of one show by walking the dataset layout
#   <PODCAST_DIR>/<segment>/<alphabet subdir>/<show dir>/<episode>.json
# Paths are stored relative to PODCAST_DIR.
json_files = []
PODCAST_DIR = os.path.join(DATA_DIR, 'podcasts-transcripts')
TARGET_SHOW_DIR = 'show_0XDDRp9nP5S3kgx413Ixg3'

# os.listdir() returns entries in arbitrary order; sort for reproducible output.
for seg_name in sorted(os.listdir(PODCAST_DIR)):  # Segments of transcript dataset
    seg_path = os.path.join(PODCAST_DIR, seg_name)
    if not os.path.isdir(seg_path):
        continue  # ignore stray files next to the segment directories
    for alpha_subdir in sorted(os.listdir(seg_path)):  # alphabet subdirectory
        # Only one show is of interest, so test for its directory directly
        # instead of listing every show under this subdirectory.
        show_path = os.path.join(seg_path, alpha_subdir, TARGET_SHOW_DIR)
        if not os.path.isdir(show_path):
            continue
        json_files.extend(
            os.path.join(seg_name, alpha_subdir, TARGET_SHOW_DIR, fname)
            for fname in sorted(os.listdir(show_path))
            if fname.endswith(".json")
        )

print("Number of json file", len(json_files))

# Optional: uncomment to persist the collected list next to the dataset.
# with open(os.path.join(DATA_DIR, "json_file_list.txt"), "w") as f:
#     for json_file in json_files:
#         f.write(json_file + "\n")

In [None]:
# len(json_files)
json_files[0]

## Analyze a sample show id=`0XDDRp9nP5S3kgx413Ixg3`

In [None]:
# Locate the sample show's directory and list its episode transcripts.
show_id = '0XDDRp9nP5S3kgx413Ixg3'
show_prefix = 'show_' + show_id
# Reuse `show_prefix` instead of repeating the id inside a path literal.
# NOTE(review): the '0' / 'X' components look like the first two characters of
# the show id (second one upper-cased) -- keep them in sync with `show_id`.
show_dir = os.path.join(DATA_DIR, 'podcasts-transcripts', '0', 'X', show_prefix)
fnames = [fn for fn in os.listdir(show_dir) if fn.endswith(".json")]
len(fnames)

In [None]:
def read_json_file(fpath: str) -> typing.Any:
    """Load and return the parsed content of the JSON file at `fpath`.

    The file is decoded as UTF-8 explicitly so that the result does not
    depend on the platform's default locale encoding.

    Args:
        fpath: Path of the JSON file to read.

    Returns:
        The deserialized JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(fpath, 'r', encoding='utf-8') as f:
        return json.load(f)

def is_empty_alternative(alternatives: typing.List[typing.Any]) -> bool:
    """Tell whether an `alternatives` list is a placeholder with no content.

    The list counts as empty only when it holds exactly one element and that
    element is falsy (e.g. an empty dict). Lists of any other length,
    including a list with no elements at all, are reported as non-empty.
    """
    has_single_entry = len(alternatives) == 1
    return has_single_entry and not alternatives[0]

### Sample episode: id = `1qgr1zTevH7IOvK4My30ht`

Episode 70: Make Your Opponent Hit a First Volley

In [None]:
# Load the transcript of the sample episode from the show's directory.
episode_id = '1qgr1zTevH7IOvK4My30ht'
episode_path = os.path.join(show_dir, f'{episode_id}.json')
episode = read_json_file(episode_path)

In [None]:
# Gather every entry of the dataset's file listing that mentions the sample
# show's directory name; entries are stored without surrounding whitespace.
sample_show_json_list = []

with open(os.path.join(DATA_DIR, 'file_list.txt'), 'r') as f:
    sample_show_json_list.extend(
        entry for entry in map(str.strip, f) if show_prefix in entry
    )
len(sample_show_json_list)

In [None]:
episode['results'][0]['alternatives'][0]

In [None]:
# Load a second episode of the same show for comparison.
episode_id = '2GW6G1xC9RT4eUDuLgZMB5'
episode_path = os.path.join(show_dir, f'{episode_id}.json')
episode = read_json_file(episode_path)

In [None]:
episode['results'][0]['alternatives'][0]