In [1]:
import os
import re
import csv
import sys
import nltk
# import pprint
import sqlite3
import pandas as pd
# import matplotlib.pyplot as plt

input_file = 'USvideos.csv'
pos_file = 'parts_of_speech.csv'

with open(input_file, 'r') as youtube_data:
    masterdata_csv = list(csv.reader(youtube_data))

with open(pos_file, 'r') as parts_of_speech:
    pos_load = csv.reader(parts_of_speech)
    pos_dict = {}
    for row in pos_load:
        pos_dict[row[0]] = row[1]

def open_db(database):
    db = sqlite3.connect(database)
    db.execute('PRAGMA foreign_keys = ON;')
    return db

#### Making an effort to decorate

In [2]:
def sql_decorator(func):
    
    def sql_action(statement, opt_args = None):
        db = None
        try:
            db = open_db('youtube.db')
            c = db.cursor()
            f = func(c, statement, opt_args)
            if f:
                return f
                #hackey attempt to result of the function call just for fetchall
            db.commit()

        except OSError as err:
            print("OS error: {}".format(err))

        finally:
            db.rollback()
            db.close()
            

    return sql_action

@sql_decorator
def run_sql(c, statement, opt_args):
    if opt_args == None:
        opt_args = []
    c.execute(statement, opt_args)

@sql_decorator
def run_sql_many(c, statement, opt_args):
    c.executemany(statement, opt_args)

@sql_decorator
def sql_fetchall(c, statement, opt_args):
    if opt_args == None:
        opt_args = []
    c.execute(statement, opt_args)
    return c.fetchall()
    

#### Create database and master table

In [3]:
try : 
    os.remove('youtube.db')
except FileNotFoundError:
    pass

In [4]:
create_statement_master = '''
    CREATE TABLE tblMasterData (
    id INTEGER AUTOIMCREMENT PRIMARY KEY,
    video_id TEXT,
    trending_date TEXT,
    title TEXT,
    channel_title TEXT,
    category_id INTEGER,
    publish_time INTEGER,
    tags TEXT,
    views INTEGER,
    likes INTEGER,
    dislikes INTEGER,
    comment_count INTEGER,
    thumbnail_link TEXT,
    comments_disabled TEXT,
    ratings_disabled TEXT,
    video_error_or_removed TEXT,
    description TEXT)
    '''

run_sql(create_statement_master)

#### Save column headers into a dictionary and delete column header row

In [5]:
headers = masterdata_csv[0]
headers_dict = {}

for count, value in enumerate(headers, 1):
    headers_dict[value] = count -1
# to keep from 0 indexing my columns

del masterdata_csv[0]

#### Load data into master table

In [6]:
load_statement_master = '''
    INSERT INTO tblMasterData
    (video_id,
    trending_date,
    title,
    channel_title,
    category_id,
    publish_time,
    tags,
    views,
    likes,
    dislikes,
    comment_count,
    thumbnail_link,
    comments_disabled,
    ratings_disabled,
    video_error_or_removed,
    description)
    VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
    '''

source = masterdata_csv

run_sql_many(load_statement_master, source)

#### After reviewing master data, create two tables based on function -- one static, one transactional

In [7]:
create_statement_video = '''
    CREATE TABLE tblVideos (
    video_id TEXT PRIMARY KEY NOT NULL,
    title TEXT NOT NULL,
    channel_title TEXT NOT NULL,
    publish_time INTEGER NOT NULL,
    tags TEXT NOT NULL,
    thumbnail_link TEXT NOT NULL,
    comments_disabled TEXT NOT NULL,
    ratings_disabled TEXT NOT NULL,
    video_error_or_removed TEXT NOT NULL,
    description TEXT NOT NULL)
    '''

run_sql(create_statement_video)

create_statement_time = '''
    CREATE TABLE tblTime (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    video_id TEXT NOT NULL,
    trending_date TEXT NOT NULL,
    views INTEGER NOT NULL,
    likes INTEGER NOT NULL,
    dislikes INTEGER NOT NULL,
    comment_count INTEGER NOT NULL,
        FOREIGN KEY(video_id) REFERENCES tblVideos(video_id))
    '''

run_sql(create_statement_time)

#### Date is stored in hard-to-read format. Transform date column.

In [8]:
for row in masterdata_csv:
    date = row[headers_dict['trending_date']]
    updated_date = "20" + date[:2]
    updated_date += "-"
    updated_date += date[6:]
    updated_date += "-"
    updated_date += date[3:5]
    row[headers_dict['trending_date']] = updated_date

#### Titles contain characters + and &. These will not be parseable. Replacing with 'and'.

In [9]:
for row in masterdata_csv:
    title = row[headers_dict['title']]
    updated_title = re.sub('&|\+', 'and', title)
    row[headers_dict['title']] = updated_title

#### Separate CSV file into lists to be loaded to each table

In [10]:
video_data = []
time_data = []

for row in masterdata_csv:
    video_entry = [row[headers_dict['video_id']], 
                   row[headers_dict['title']], 
                   row[headers_dict['channel_title']], 
                   row[headers_dict['publish_time']],
                   row[headers_dict['tags']],
                   row[headers_dict['thumbnail_link']],
                   row[headers_dict['comments_disabled']],
                   row[headers_dict['ratings_disabled']],
                   row[headers_dict['video_error_or_removed']],
                   row[headers_dict['description']]
                  ]
    
    video_data.append(video_entry)
    
    time_entry = [row[headers_dict['video_id']], 
                  row[headers_dict['trending_date']], 
                  row[headers_dict['views']], 
                  row[headers_dict['likes']],
                  row[headers_dict['dislikes']],
                  row[headers_dict['comment_count']]
                 ]
    
    time_data.append(time_entry)

#### Load data to tables

In [11]:
load_statement_videos = '''
    INSERT OR REPLACE INTO tblVideos
    (video_id,
    title,
    channel_title,
    publish_time,
    tags,
    thumbnail_link,
    comments_disabled,
    ratings_disabled,
    video_error_or_removed,
    description)
    VALUES (?,?,?,?,?,?,?,?,?,?)
    '''

run_sql_many(load_statement_videos, video_data)


In [12]:
load_statement_time = '''
    INSERT INTO tblTime
    (video_id,
    trending_date,
    views,
    likes,
    dislikes,
    comment_count)
    VALUES (?,?,?,?,?,?)
    '''

run_sql_many(load_statement_time, time_data)

#### Test for foreign key failure, uncomment and run for evidence  

In [13]:
# failure_statement = "INSERT INTO tblTime VALUES (?, ?, ?, ?, ?, ?, ?)"
# failure_params = (None, "testy", "18.11.11", 1, 1, 1, 1)

# run_sql(failure_statement, failure_params)

#### Titles appear to come in multiple segments, divided by special characters. Create Segments table to store segments of each title for analysis

In [14]:
create_statement_segments = '''
    CREATE TABLE tblSegments (
    segment_id INTEGER PRIMARY KEY AUTOINCREMENT,
    video_id TEXT NOT NULL,
    segment_text TEXT NOT NULL,
    segment_structure TEXT,
        FOREIGN KEY(video_id) REFERENCES tblVideos(video_id))
    '''

run_sql(create_statement_segments)

#### Define classes to facilitate analaysis...and for practice

In [15]:
class Video:
    
    def __init__(self, video_id, title):
        self.video_id = video_id
        self.title = title
    
    def longest_word(self):
        longest_length = 0
        longest_word = ''
        for word in self.title:
            if len(word) > longest_length:
                longest_length = len(word)
                longest_word = word
            else:
                continue
        return longest_word

    def title_segments(self):
        list = re.compile("(?:\||(?:\s-\s)|—|:|\(|\)|\[|\]|{|})+").split(self.title)
        # ?: indicates a non-capture group so delimiters aren't saved. Now I know.
        list = filter(lambda x: x != None, list)
        #  python is returning None where the delimiter was. Removing Nones.
        segments = []
        for text in list:
            text = text.strip()
            if text == "":
                continue
            segments.append(Segment(self.video_id, text))
        return segments

    
class Segment:
    
    def __init__(self, video_id, text):
        self.video_id = video_id
        self._text = text
        
    def text(self):
        lower_words = self._text.lower()
        return re.sub('[^A-Za-z0-9\s\-\']+', '', lower_words)
    
    def words(self):
        list = self.text().split()
        list = filter(lambda x: x != None, list)
        #  python is returning None where the delimiter was. Removing Nones.
        segments = []
        for item in list:
            text = item.strip()
            if text == "":
                continue
            segments.append(text)
        return segments
    
    def parts_of_speech(self):
        text = nltk.word_tokenize(self.text())
        tagged_text = nltk.pos_tag(text)
        #  creates list of tuples with (word, part of speech)
        return [item[1] for item in tagged_text]

class Title_glob:
    
    def __init__(self, glob):
        self.glob = glob
    
    def word_list(self):
        return self.glob
    
    def frequency_distribution(self):
        return nltk.FreqDist(self.glob)
    
    def lexical_diversity(self):
        return len(set(self.glob)) / len(self.glob)

#### Select data from Videos table to parse into segments, generate parts of speech for each, and commit to Segments table

In [26]:
titles_list = sql_fetchall("SELECT video_id, title FROM tblVideos;")

[('9wRQljFNDW8', "Dion Lewis' 103-Yd Kick Return TD vs. Denver! | Can't-Miss Play | NFL Wk 10 Highlights"), ('Om_zGhJLZ5U', 'TL;DW - Every DCEU Movie Before Justice League'), ('goP4Z5wyOlM', 'Iraq-Iran earthquake: Deadly tremor hits border region - BBC News'), ('8NHA23f7LvU', 'Jason Momoa Wows Hugh Grant With Some Dothraki | The Graham Norton Show'), ('IE-xepGLVt8', "Mayo Clinic's first face transplant patient meets donor’s family")]


In [17]:
segment_entries=[]

for row in titles_list:
    video = Video(row[0], row[1])
    segments = video.title_segments()
    for segment in segments:
        segment_entry = []
        segment_entry.append(segment.video_id)
        segment_entry.append(segment.text())
        segment_entry.append(", ".join(segment.parts_of_speech()))
        segment_entries.append(segment_entry)

In [18]:
insert_statement_segments = '''
        INSERT INTO tblSegments
        (video_id,
        segment_text,
        segment_structure)
        VALUES (?,?, ?)
        '''
source = segment_entries

run_sql_many(insert_statement_segments, source)

# select_segments = run_query('SELECT * FROM tblSegments WHERE segment_id>=(abs(random()) % (SELECT max(segment_id)FROM tblSegments)) LIMIT 5')

# select_segments

It doesn't know that all nouns aren't proper...bit of a bummer

In [19]:
segments_list = sql_fetchall("SELECT video_id, segment_text, segment_id FROM tblSegments;")

#### Concatenate all segments for analysis as a whole

In [20]:
def lower_case(list):
    return [w.lower() for w in list]

all_segment_words = []

for item in segments_list:
    segment = Segment(item[0], item[1])
    segment_words = segment.words()
    for word in segment_words: 
        all_segment_words.append(word)

all_segment_words = lower_case(all_segment_words)

all_words = Title_glob(sorted(all_segment_words))
# print(all_words.word_list())

all_tokens = Title_glob(sorted(set(all_segment_words)))
# print(all_tokens.word_list())

#### The X most common words

In [21]:
freq_dist = all_words.frequency_distribution()

print(freq_dist.most_common(30))

[('the', 1627), ('and', 753), ('a', 748), ('to', 685), ('in', 538), ('of', 531), ('official', 486), ('with', 470), ('on', 405), ('2018', 361), ('video', 317), ('for', 285), ('i', 285), ('trailer', 276), ('how', 274), ('is', 267), ('my', 257), ('you', 240), ('from', 201), ('at', 194), ('vs', 178), ('2017', 175), ('new', 170), ('ft', 155), ('live', 144), ('what', 142), ('first', 141), ('2', 139), ('hd', 139), ('this', 139)]


In [22]:
print("""
        The lexical diversity of the corpus is {}. 
        This is generated by dividing the length of the set of unique words over the length of  the set of all words."""
        .format(all_words.lexical_diversity()))



        The lexical diversity of the corpus is 0.21445316284025961. 
        This is generated by dividing the length of the set of unique words over the length of  the set of all words.


In [23]:
longest_word = ''
longest_length = 0

for word in all_tokens.word_list():
    if len(word) > longest_length:
        longest_word = word
        longest_length = len(word)

print("The longest word in the corpus is {} with a length of {} characters".format(longest_word, longest_length))

The longest word in the corpus is brfxxccxxmnpcccclllmmnprxvclmnckssqlbb11116 with a length of 43 characters


In [24]:
segments_count = sql_fetchall("""
            SELECT count(segment_id) as number_of_segments, video_id 
            FROM tblSegments 
            GROUP BY video_id 
            ORDER BY number_of_segments desc
            LIMIT 6;
            """)

segments_count

[(6, '07JQ4WZJIbg'),
 (6, 'IxF3mxWbdjw'),
 (6, 'JWH5KE1atAg'),
 (6, 'XiHiW4N7-bo'),
 (6, 'Yq4_YocuVeg'),
 (6, 'Zjp0mdMeIPU')]

In [25]:
df = sql_fetchall("""
            SELECT count(segment_id) as number_of_segments, video_id 
            FROM tblSegments 
            GROUP BY video_id;
            """)

print(df.mean())

AttributeError: 'list' object has no attribute 'mean'

The mean number of segments per title is 1.855141

In [None]:
df = sql_fetchall('''
            SELECT count(video_id) as number_of_pattern_occurances, segment_structure
            FROM tblSegments 
            GROUP BY segment_structure
            ORDER BY number_of_pattern_occurances DESC
            LIMIT 10;
            ''')
print(df)

In [None]:

repeated = """
        SELECT count(video_id) as number_of_pattern_occurances
        FROM tblSegments 
        GROUP BY segment_structure
        HAVING number_of_pattern_occurances > 1
        ORDER BY number_of_pattern_occurances DESC;
        """

every = """
    SELECT count(video_id) as number_of_pattern_occurances
    FROM tblSegments 
    GROUP BY segment_structure
    ORDER BY number_of_pattern_occurances DESC;
    """

df_repeated = sql_fetchall(repeated)
df_all = sql_fetchall(every)

print( """
        The number of repeated grammatical patterns is {}.
        There are a total of {} unique grammatical patterns found in title segments."""
        .format(len(df_repeated.index), len(df_all.index)))
