In [1]:
import pandas as pd

In [2]:
from pandasql import sqldf
def pysqldf(q):
    """Run SQL query string `q` through pandasql's `sqldf`.

    The notebook's global namespace is passed as the environment, so
    module-level DataFrames can be referenced by name inside the query.
    Returns whatever `sqldf` returns for the query.
    """
    return sqldf(q, globals())

In [40]:
# Do not truncate long text cells when DataFrames are displayed.
pd.set_option("display.max_colwidth", None)

In [41]:
# Load the transcript rows from the CSV file in the working directory.
df_transcripts = pd.read_csv(filepath_or_buffer='transcripts.csv')

In [45]:
# Keep each row's index label as an ordinary column so that SQL queries
# (which only see columns) can refer to it as `row_num`.
df_transcripts.loc[:, 'row_num'] = df_transcripts.index

In [62]:
# Preview the first ten transcript rows, sorted by video_id and then by start.
pysqldf(
    "SELECT * FROM df_transcripts ORDER BY video_id, start"
).head(n=10)

Unnamed: 0,text,start,duration,video_id,row_num
0,"- So today's agenda,",8.11,1.583,4eUS8trd_yI,3378
1,we're gonna start with why am\nI talking China and Vietnam.,10.55,4.483,4eUS8trd_yI,3379
2,We're then gonna talk\nabout reform in China,16.39,2.75,4eUS8trd_yI,3380
3,"in the period leading up\nto the 4th of June, 1989",19.14,4.81,4eUS8trd_yI,3381
4,which is the Tiananmen Square massacre.,23.95,3.383,4eUS8trd_yI,3382
5,We'll then talk about\nTiananmen in the 1990s,28.24,3.74,4eUS8trd_yI,3383
6,that will lead us into a\ndiscussion of what I'm calling,32.84,3.02,4eUS8trd_yI,3384
7,the sequencing debate of\npolitical and economic reform.,35.86,4.72,4eUS8trd_yI,3385
8,Once it was clear,40.58,1.15,4eUS8trd_yI,3386
9,"that communism was gonna\nbe replaced by capitalism,",41.73,4.0,4eUS8trd_yI,3387


In [42]:
# List the column labels of the transcript frame.
# NOTE(review): the saved output for this cell does not include 'row_num', and
# its execution count (42) is lower than that of the cell which adds 'row_num'
# (45), so the output looks stale -- re-run the notebook top to bottom to confirm.
df_transcripts.columns

Index(['text', 'start', 'duration', 'video_id'], dtype='object')

In [66]:
## Merge the text of sequential transcript cells within each video_id into a
## single row, joining the pieces with a space.
##
## Chunking: the cells of each video are numbered 1, 2, 3, ... in order of
## `start`. Adding (cells_per_chunk - 1) and integer-dividing by
## cells_per_chunk (integer division keeps the quotient and drops the
## remainder) maps cells 1..N to chunk 1, cells N+1..2N to chunk 2, and so on.

def merge_transcript_cells(cells_per_chunk=10):
    """Squash every `cells_per_chunk` consecutive cells of a video into one row.

    Reads the global `df_transcripts` frame through `pysqldf`; the frame must
    provide the columns `text`, `start`, `duration`, `video_id` and `row_num`.

    Parameters
    ----------
    cells_per_chunk : int
        Number of consecutive cells merged into one output row (>= 1).

    Returns
    -------
    The query result from `pysqldf`, with the columns `text` (space-joined
    cell texts), `MIN(start)`, `MAX(start)`, `SUM(duration)` and `video_id`,
    ordered by video and then by the chunk's first start value.

    Raises
    ------
    ValueError
        If `cells_per_chunk` is not a positive integer. The value is
        interpolated into the SQL text, so anything else is rejected.
    """
    if (
        isinstance(cells_per_chunk, bool)
        or not isinstance(cells_per_chunk, int)
        or cells_per_chunk < 1
    ):
        raise ValueError("cells_per_chunk must be a positive integer")

    # Shifts the 1-based cell number so that integer division yields chunk 1
    # for the first `cells_per_chunk` cells, chunk 2 for the next, etc.
    offset = cells_per_chunk - 1

    # ROW_NUMBER (not RANK) is used so that cells sharing the same `start`
    # still receive distinct consecutive numbers; with RANK, ties share a rank
    # and a chunk could end up with more than `cells_per_chunk` cells.
    # `row_num` breaks such ties deterministically.
    #
    # The CTE is ordered the same way the cells are numbered so that rows reach
    # GROUP_CONCAT in chunk order. NOTE(review): SQLite only guarantees the
    # concatenation order when ORDER BY is written inside the aggregate
    # (SQLite >= 3.44); on older versions this ordering is best-effort.
    return pysqldf(f"""
WITH df_ranked AS

(SELECT
    *,
    (ROW_NUMBER () OVER (
        PARTITION BY video_id
        ORDER BY start ASC, row_num ASC
    ) + {offset}) / {cells_per_chunk} RNK
FROM
    df_transcripts
ORDER BY video_id, start, row_num
)

SELECT 
    GROUP_CONCAT(text, ' ') as text,
    MIN(start), 
    MAX(start), 
    SUM(duration), 
    video_id 
FROM df_ranked
GROUP BY
    rnk, video_id
ORDER BY video_id, MIN(start)
""")


df_merged_10 = merge_transcript_cells(10)

In [67]:
# Show the first five merged rows as a sanity check on the chunking.
df_merged_10.head(n=5)

Unnamed: 0,text,MIN(start),MAX(start),SUM(duration),video_id
0,"- So today's agenda, we're gonna start with why am\nI talking China and Vietnam. We're then gonna talk\nabout reform in China in the period leading up\nto the 4th of June, 1989 which is the Tiananmen Square massacre. We'll then talk about\nTiananmen in the 1990s that will lead us into a\ndiscussion of what I'm calling the sequencing debate of\npolitical and economic reform. Once it was clear that communism was gonna\nbe replaced by capitalism,",8.11,41.73,33.639,4eUS8trd_yI
1,"there was a huge debate about, well, is it better to have\npolitical reform first or economic reform first. We'll try to do them together and we'll talk about\nthat sequencing debate which will lead us into\na larger discussion of what since the 1950s has been known as modernization theory. The thesis that economic modernization will eventually produce demand for",45.73,69.72,26.45,4eUS8trd_yI
2,"and the establishment of democracy and that will then leave us\nto think about the future. So, China and Vietnam Today. - [Narrator] China so far\nhas built the equivalent of Europe's entire housing\nstock in just 15 years. In November 2015, Beijing\nreplaced the substantially larger 1300 ton Sanyuan Bridge in just 43 hours. Between 1996 and 2016, China has built 2.6 million miles of roads including 70,000 miles of highways",72.18,109.55,36.986,4eUS8trd_yI
3,"connecting 95% of the country's villages and overtaking the US as the country with the most extensive\nhighway system by almost 50%. Over the past decade, China has constructed the world's longest high speed rail network. 12,000 miles of rail lines that carry passengers between cities, at speeds up to 180 miles per hour. China now has more high speed rail tracks",112.28,135.52,25.74,4eUS8trd_yI
4,"than the rest of the world combined. - So that's one of any\nnumber of video clips one could pick to just give a snapshot of the incredible transformation\nof the Chinese economy over the last couple of decades and indeed, over the last decade, I went to Beijing last\nyear for the first time in about 12 years and 12 years ago, there were lots of\npotholes in the streets,",138.02,168.288,33.273,4eUS8trd_yI


In [68]:
# Persist the merged transcript rows; the DataFrame index is written as the
# first (unnamed) column because `index` is left at its default.
df_merged_10.to_csv(path_or_buf='transcripts_merged_10.csv')