# Transcript Data Analytics

In [2]:
import os
import json

In [4]:
# The project root sits two directory levels above the current working directory.
ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))

# Dataset directory inside the project tree.
DATA_DIR = os.path.join(
    ROOT_DIR,
    'data',
    'podcasts-transcript',
    'spotify-podcasts-2020',
)

# Folder holding the transcript files, directly under DATA_DIR.
TRANSCRIPT_ROOT_DIR = os.path.join(DATA_DIR, 'podcasts-transcripts')

In [19]:
# Disabled: reads the first entry of DATA_DIR/file_list.txt. The literal assigned
# at the bottom of this cell is presumably that entry, pinned so the notebook
# does not need file_list.txt -- TODO confirm against the file.
# with open(os.path.join(DATA_DIR, 'file_list.txt'), 'r') as f:
#     first_line = next(f)

# print(first_line)
# The path is relative to DATA_DIR and carries a leading './'.
first_line = './podcasts-transcripts/4/J/show_4jJdofaAzXkKpsFJ8wGS9I/2i074A63WWvH4Vc279IQiX.json'


Pick a sample transcript JSON file

In [20]:
# Turn the file-list entry into a path under DATA_DIR: drop the surrounding
# whitespace/newline and strip the leading './' only when it is actually there
# (a blind [2:] slice would eat two real characters from an unprefixed entry).
sample_rel_path = first_line.strip()
if sample_rel_path.startswith('./'):
    sample_rel_path = sample_rel_path[2:]
SAMPLE_PATH = os.path.join(DATA_DIR, sample_rel_path)

# Read the sample transcript; the encoding is explicit so the result does not
# depend on the platform's default locale.
with open(SAMPLE_PATH, 'r', encoding='utf-8') as f:
    data = json.load(f)

In [21]:
# Drill into the first result and list the keys of its first alternative.
alternatives = data['results'][0]['alternatives']
alternatives[0].keys()

dict_keys([])

## Structure of the transcript JSON file

* The file is a dict containing a single key `results`, which is a list of items
* Each item in the list is a dictionary that has only one key `alternatives`
* This `alternatives` key refers to a list of **alternative** objects
* Each **alternative** is a dict whose keys include `transcript`, `confidence`, `words`
* `transcript`: a piece of text corresponding to a small portion of the episode's transcript
* `confidence`: double
* `words`: a list of word token objects, each object has attributes `startTime`, `endTime` and `word`

Structure of the transcript JSON

In [22]:
# `results` is the list stored under the top-level key; show how many items it has.
results = data['results']
len(results)

9

### 1. A `result` item

Each `result` item contains a single key called `alternatives` which is a list

In [23]:
# Take the first result and show which keys it carries.
result = results[0]
result.keys()

dict_keys(['alternatives'])

### 2. An `alternative` item

Each `alternative` item is a dictionary containing three keys:
* `transcript`: an excerpt of the transcript
* `confidence`: confidence score of the transcript (presumably an indication of how accurate the generated text is)
* `words`: a list of JSON objects

In [24]:
# Rebind `alternatives` to the first result's list and count its entries.
alternatives = result['alternatives']
len(alternatives)

1

In [25]:
# Inspect the keys of the first alternative of the first result.
alternative = alternatives[0]
alternative.keys()

dict_keys([])

In [16]:
# Transcribed text stored in this alternative.
alternative['transcript']

"Hello and welcome to the first episode of The Fan into flame podcast the official podcast of Greenwood Mennonite School athletics. My name is Tyler warfel. I teach high school math here at GMS and Coach the High School boys basketball team. I'm a big podcast listener. I listen to a lot of different types of podcasts mostly Sports podcast, but also some teaching podcast some news and current events podcasts. I like True Crime podcasts."

In [17]:
# Confidence value attached to this alternative.
alternative['confidence']

0.853115439414978

In [19]:
# Number of word tokens in this alternative.
len(alternative['words'])

76

### 3. A `word` item

Each word item is a dict containing three required keys: `startTime`, `endTime` and `word`. In some alternatives, a word can also contain a `speakerTag` key.

In [20]:
# Preview only the first ten word tokens to keep the output short.
alternative['words'][:10]

[{'startTime': '1.400s', 'endTime': '1.700s', 'word': 'Hello'},
 {'startTime': '1.700s', 'endTime': '1.800s', 'word': 'and'},
 {'startTime': '1.800s', 'endTime': '2.200s', 'word': 'welcome'},
 {'startTime': '2.200s', 'endTime': '2.400s', 'word': 'to'},
 {'startTime': '2.400s', 'endTime': '2.400s', 'word': 'the'},
 {'startTime': '2.400s', 'endTime': '2.800s', 'word': 'first'},
 {'startTime': '2.800s', 'endTime': '3.500s', 'word': 'episode'},
 {'startTime': '3.500s', 'endTime': '3.600s', 'word': 'of'},
 {'startTime': '3.600s', 'endTime': '3.800s', 'word': 'The'},
 {'startTime': '3.800s', 'endTime': '4.300s', 'word': 'Fan'}]

## Anomaly detection

We suspect that not all **alternatives** have the above structure, so let's check whether any of them deviate from it.

In [21]:
# Keep every result whose `alternatives` list does not hold exactly one entry.
multi_al_results = [
    entry
    for entry in results
    if len(entry['alternatives']) != 1
]
multi_al_results

[]

In [24]:
# Pull the first alternative out of each result, then report how many were collected.
res_alternatives = [
    entry['alternatives'][0]
    for entry in results
]
len(res_alternatives)

17

In [26]:
# Report each alternative that lacks a `transcript` key, along with its word count.
for alt in res_alternatives:
    if 'transcript' in alt:
        continue  # regular alternative, nothing to report
    print("Alternative doesn't have `transcript` key")
    print("No. of words: ", len(alt['words']))

Alternative doesn't have `transcript` key
No. of words:  1283


## Conclusion

After the above exploratory analysis, we conclude that:
* Each JSON transcript file holds a single dictionary that contains a single key `results`
* `results` is a list of dictionaries, each containing a single key `alternatives`, which in turn is a list of objects
* Each `alternatives` list contains dictionaries, each dubbed an `alternative`
* There are **two** types of `alternative`:
  - Type 1: a dictionary containing **three** keys: `transcript`, `confidence`, and `words`
  - Type 2: a dictionary containing **only one** key, `words`, that holds all the word tokens in the transcript of a podcast

In [17]:
# NOTE(review): `count` is only used by the disabled os.walk probe below; it is
# kept in case a later cell reads it.
count = 0
# for (dirpath, dirname, filenames) in os.walk(os.path.join(DATA_DIR, 'podcasts-transcripts')):
#     print(dirname, filenames)
#     count += 1
#     if count > 100:
#         break

# Collect the transcript JSON files of one show. Paths are stored relative to
# PODCAST_DIR as <segment>/<alphabet subdir>/<show dir>/<file>.json.
TARGET_SHOW_DIR = 'show_4jJdofaAzXkKpsFJ8wGS9I'  # the only show this cell keeps

json_files = []
PODCAST_DIR = os.path.join(DATA_DIR, 'podcasts-transcripts')
for seg_name in os.listdir(PODCAST_DIR):  # Segments of transcript dataset
    seg_path = os.path.join(PODCAST_DIR, seg_name)
    if not os.path.isdir(seg_path):
        # A stray file at this level (e.g. .DS_Store) would make os.listdir raise.
        continue
    for alpha_subdir in os.listdir(seg_path):  # alphabet subdirectory
        alpha_subdir_path = os.path.join(seg_path, alpha_subdir)
        if not os.path.isdir(alpha_subdir_path):
            continue
        for show_dir in os.listdir(alpha_subdir_path):  # show directory
            if show_dir != TARGET_SHOW_DIR:
                continue
            show_path = os.path.join(alpha_subdir_path, show_dir)
            if not os.path.isdir(show_path):
                continue
            for fname in os.listdir(show_path):
                if not fname.endswith(".json"):
                    continue
                json_files.append(os.path.join(seg_name, alpha_subdir, show_dir, fname))

print("Number of json file", len(json_files))

# Disabled: persist the collected relative paths to DATA_DIR/json_file_list.txt.
# with open(os.path.join(DATA_DIR, "json_file_list.txt"), "w") as f:
#     for json_file in json_files:
#         f.write(json_file + "\n")

Number of json file 67


In [18]:
# Display the collected paths (relative to PODCAST_DIR).
json_files

['4/J/show_4jJdofaAzXkKpsFJ8wGS9I/2i074A63WWvH4Vc279IQiX.json',
 '4/J/show_4jJdofaAzXkKpsFJ8wGS9I/0vyLvCgyBjcLeVJTKc2cUl.json',
 '4/J/show_4jJdofaAzXkKpsFJ8wGS9I/24d2AKEjyjZnsZBBo76h9U.json',
 '4/J/show_4jJdofaAzXkKpsFJ8wGS9I/0sM9AoMdEN0Xgiy5tToPrx.json',
 '4/J/show_4jJdofaAzXkKpsFJ8wGS9I/49qeJLT70g693viogUK4d0.json',
 '4/J/show_4jJdofaAzXkKpsFJ8wGS9I/2KMSZ7uDnIpEywvssOuvez.json',
 '4/J/show_4jJdofaAzXkKpsFJ8wGS9I/3oKCx45xRsHjzwbpKlIJd3.json',
 '4/J/show_4jJdofaAzXkKpsFJ8wGS9I/7jUE1DJ6xyxLBPMzSeLW5W.json',
 '4/J/show_4jJdofaAzXkKpsFJ8wGS9I/3MEENb9JXqVd1Htm2tSNA2.json',
 '4/J/show_4jJdofaAzXkKpsFJ8wGS9I/6SqpthX1D3e1CXrCGgpSwi.json',
 '4/J/show_4jJdofaAzXkKpsFJ8wGS9I/2SifBnEafsZhruD4mwKePQ.json',
 '4/J/show_4jJdofaAzXkKpsFJ8wGS9I/2TWUNERicg9bMnHyq8dSr7.json',
 '4/J/show_4jJdofaAzXkKpsFJ8wGS9I/4Qu1gLwPtCEuO6wWXXQ4CN.json',
 '4/J/show_4jJdofaAzXkKpsFJ8wGS9I/2qoAP1EQlZyUq31uBaFM5b.json',
 '4/J/show_4jJdofaAzXkKpsFJ8wGS9I/71QbhzQw3yF5sJaJQyjhMY.json',
 '4/J/show_4jJdofaAzXkKpsFJ8wGS9I/6AmYkJ