In [1]:
from random import randint, seed

class SkipNode:
    """One entry of a skip list: an element, its key, and per-level forward links."""
    def __init__(self, height = 0, elem = None , key=None):
        # next[i] is this node's successor on level i; a new node links to nothing.
        self.next = [None for _ in range(height)]
        self.elem = elem
        self.key = key

class SkipList:
    """Sorted collection of unique elements, each carrying a key, kept as a skip list.

    Elements are ordered with ``<`` and matched with ``==``. An element is
    stored at most once; inserting one that is already present does nothing.
    """

    def __init__(self):
        # The head is a sentinel without an element; head.next[i] is the first
        # real node on level i.
        self.head = SkipNode()
        self.len = 0
        self.maxHeight = 0

    def __len__(self):
        """Return the number of stored elements."""
        return self.len

    def find(self, elem, update = None):
        """Return the node whose element equals elem, or None if absent.

        update may be a predecessor list previously obtained from
        updateList(elem); when omitted it is computed here.
        """
        if update is None:
            update = self.updateList(elem)
        if len(update) > 0:
            # The level-0 predecessor is followed by elem itself, if stored.
            candidate = update[0].next[0]
            if candidate is not None and candidate.elem == elem:
                return candidate
        return None

    def contains(self, elem, update = None):
        """Return True if elem is stored (see find for the update argument)."""
        return self.find(elem, update) is not None

    def randomHeight(self):
        """Draw a node height >= 1; each extra level is added with probability 1/2."""
        height = 1
        while randint(1, 2) != 1:
            height += 1
        return height

    def updateList(self, elem):
        """Return the predecessor of elem on every level below maxHeight.

        Entry i is the last node on level i whose element is < elem, or the
        head sentinel when there is no such node.
        """
        update = [None]*self.maxHeight
        x = self.head
        # Walk from the top level down, reusing the position reached so far.
        for i in reversed(range(self.maxHeight)):
            while x.next[i] is not None and x.next[i].elem < elem:
                x = x.next[i]
            update[i] = x
        return update

    def insert(self, elem, key):
        """Store elem with its key; do nothing if elem is already present."""
        update = self.updateList(elem)
        if self.find(elem, update) is not None:
            # Duplicate: return before touching head/maxHeight so a rejected
            # insert cannot leave empty phantom levels behind.
            return

        node = SkipNode(self.randomHeight(), elem, key)
        height = len(node.next)

        while len(self.head.next) < height:
            self.head.next.append(None)
        # Levels above the previous maxHeight hold no nodes yet, so the head
        # sentinel is the predecessor there.
        while len(update) < height:
            update.append(self.head)
        self.maxHeight = max(self.maxHeight, height)

        for i in range(height):
            node.next[i] = update[i].next[i]
            update[i].next[i] = node
        self.len += 1

    def remove(self, elem):
        """Unlink elem from every level it appears on; do nothing if absent."""
        update = self.updateList(elem)
        x = self.find(elem, update)
        if x is not None:
            for i in reversed(range(len(x.next))):
                update[i].next[i] = x.next[i]
                # A level that became empty no longer counts towards maxHeight.
                if self.head.next[i] is None:
                    self.maxHeight -= 1
            self.len -= 1

    def printList(self):
        """Print one line per level, top level first, each element followed by '-'."""
        for i in range(len(self.head.next)-1, -1, -1):
            x = self.head
            while x.next[i] is not None:
                # str() lets non-string elements (e.g. ints) be printed too.
                print(str(x.next[i].elem) + '-', end="")
                x = x.next[i]
            print('')


In [3]:
# Sample URLs used to exercise the skip list. The first two entries are the
# same URL, so the second one exercises the duplicate-element path of insert.
url_list = ["https://www.youtube.com/watch?v=1R2vIsIPI38","https://www.youtube.com/watch?v=1R2vIsIPI38","https://www.youtube.com/watch?v=adex5YdEbOs","https://www.youtube.com/watch?v=vlCumuwLkEM","https://www.youtube.com/watch?v=d25EQaECNnU","https://www.youtube.com/watch?v=Qh8DoWbumgs&t=4s","https://www.youtube.com/watch?v=OE9mcx_iJrE"]

In [4]:
sSkipList = SkipList()

In [5]:
# Store every URL as the element, with its list position (as text) as the key,
# and echo each pair as it is handed to the skip list.
for index, url in enumerate(url_list):
    elem, key = url, "{}".format(index)
    sSkipList.insert(elem, key)
    print("{}-{}".format(key, elem))

0-https://www.youtube.com/watch?v=1R2vIsIPI38
1-https://www.youtube.com/watch?v=1R2vIsIPI38
2-https://www.youtube.com/watch?v=adex5YdEbOs
3-https://www.youtube.com/watch?v=vlCumuwLkEM
4-https://www.youtube.com/watch?v=d25EQaECNnU
5-https://www.youtube.com/watch?v=Qh8DoWbumgs&t=4s
6-https://www.youtube.com/watch?v=OE9mcx_iJrE


In [6]:
sSkipList.printList()

https://www.youtube.com/watch?v=d25EQaECNnU-
https://www.youtube.com/watch?v=d25EQaECNnU-
https://www.youtube.com/watch?v=OE9mcx_iJrE-https://www.youtube.com/watch?v=Qh8DoWbumgs&t=4s-https://www.youtube.com/watch?v=d25EQaECNnU-
https://www.youtube.com/watch?v=OE9mcx_iJrE-https://www.youtube.com/watch?v=Qh8DoWbumgs&t=4s-https://www.youtube.com/watch?v=adex5YdEbOs-https://www.youtube.com/watch?v=d25EQaECNnU-
https://www.youtube.com/watch?v=1R2vIsIPI38-https://www.youtube.com/watch?v=OE9mcx_iJrE-https://www.youtube.com/watch?v=Qh8DoWbumgs&t=4s-https://www.youtube.com/watch?v=adex5YdEbOs-https://www.youtube.com/watch?v=d25EQaECNnU-https://www.youtube.com/watch?v=vlCumuwLkEM-


In [7]:
node = sSkipList.find('https://www.youtube.com/watch?v=OE9mcx_iJrE')

In [8]:
type(node)

__main__.SkipNode

In [9]:
node.key

'6'

In [53]:
def get_dd():
    """Return a fresh dict mapping each of the six record tags to 0."""
    tags = ("@url:", "@title:", "@content:", "@viewCount:", "@res:", "@duration:")
    return dict.fromkeys(tags, 0)

In [54]:
def get_ddc():
    """Return a fresh dict mapping the six record tags, plus "@", to empty strings."""
    tags = ("@url:", "@title:", "@content:", "@viewCount:", "@res:", "@duration:", "@")
    return dict.fromkeys(tags, "")

In [39]:
def get_bool(dd):
    """Return True when no value in dd equals 0, otherwise False."""
    return not any(dd[name] == 0 for name in dd)

In [90]:
data = []

In [91]:
# Parse 'youtube_partaa' into records. Each record is a dict from get_ddc()
# whose six tagged fields hold the raw lines (tag and newline included); a
# record is appended to `data` as soon as all six tags have been seen.
#
# Compared with checking for completeness at the top of the next iteration,
# storing the record immediately means (a) the line following a complete
# record is parsed instead of being silently skipped, and (b) a record that
# completes on the last line of the file is not lost.
line_text = ""  # not used below; kept so the cell still defines this name
with open('youtube_partaa',encoding='utf-8') as fin:
    flag = 0
    count = 0
    ddc = get_ddc()
    dd = get_dd()
    # Tags are tested in this order and the first one found anywhere in the
    # line decides which field the line belongs to.
    field_tags = ("@url:", "@title:", "@content:", "@viewCount:", "@res:", "@duration:")
    for line in fin:
        count += 1

        tag = next((t for t in field_tags if t in line), None)
        if tag is None:
            # Separator or other untagged line: nothing to store.
            continue

        ddc[tag] = line
        dd[tag] = 1
        if tag == "@duration:":
            flag = 1

        if get_bool(dd):
            data.append(ddc)
            ddc = get_ddc()
            dd = get_dd()
            

In [113]:
def write_file(file_name,data):
    """Append one record to file_name.

    Every value of data is written in iteration order, followed by a line
    consisting of a single "@". The file is opened in append mode as UTF-8.
    """
    with open(file_name,'a',encoding='utf-8') as out:
        out.writelines(data[field] for field in data)
        out.write("@\n")

In [127]:
# Report how many lines were read and how many complete records were collected.
print(count)
print(len(data))
# NOTE(review): `i` is not assigned at top level by any cell above this one;
# this line appears to rely on a value left in the kernel by an earlier run
# (the recorded output shows 10000) - confirm, or define `i` explicitly.
file_name = "DB_{}.txt".format(i)
print(file_name)

6439529
885980
DB_10000.txt


In [135]:
# Write the records out in shards of 5000 per "DB_<n>.txt" file and index each
# record's "@url:" line in a fresh skip list, keyed by the shard that holds it.
sSkipList = SkipList()
name_number = 0
file_name = "DB_{}.txt".format(name_number)
for i, record in enumerate(data):
    # Every 5000th record, including the very first, opens the next shard.
    if not i % 5000:
        name_number += 1
        file_name = "DB_{}.txt".format(name_number)
        print(file_name)
    write_file(file_name, record)
    sSkipList.insert(record["@url:"], file_name)

DB_1.txt
DB_2.txt
DB_3.txt
DB_4.txt
DB_5.txt
DB_6.txt
DB_7.txt
DB_8.txt
DB_9.txt
DB_10.txt
DB_11.txt
DB_12.txt
DB_13.txt
DB_14.txt
DB_15.txt
DB_16.txt
DB_17.txt
DB_18.txt
DB_19.txt
DB_20.txt
DB_21.txt
DB_22.txt
DB_23.txt
DB_24.txt
DB_25.txt
DB_26.txt
DB_27.txt
DB_28.txt
DB_29.txt
DB_30.txt
DB_31.txt
DB_32.txt
DB_33.txt
DB_34.txt
DB_35.txt
DB_36.txt
DB_37.txt
DB_38.txt
DB_39.txt
DB_40.txt
DB_41.txt
DB_42.txt
DB_43.txt
DB_44.txt
DB_45.txt
DB_46.txt
DB_47.txt
DB_48.txt
DB_49.txt
DB_50.txt
DB_51.txt
DB_52.txt
DB_53.txt
DB_54.txt
DB_55.txt
DB_56.txt
DB_57.txt
DB_58.txt
DB_59.txt
DB_60.txt
DB_61.txt
DB_62.txt
DB_63.txt
DB_64.txt
DB_65.txt
DB_66.txt
DB_67.txt
DB_68.txt
DB_69.txt
DB_70.txt
DB_71.txt
DB_72.txt
DB_73.txt
DB_74.txt
DB_75.txt
DB_76.txt
DB_77.txt
DB_78.txt
DB_79.txt
DB_80.txt
DB_81.txt
DB_82.txt
DB_83.txt
DB_84.txt
DB_85.txt
DB_86.txt
DB_87.txt
DB_88.txt
DB_89.txt
DB_90.txt
DB_91.txt
DB_92.txt
DB_93.txt
DB_94.txt
DB_95.txt
DB_96.txt
DB_97.txt
DB_98.txt
DB_99.txt
DB_100.txt
DB_101.t

KeyboardInterrupt: 

In [145]:
node = sSkipList.find('@url:https://www.youtube.com/watch?v=Y66p2iy7OuI\n')

In [146]:
type(node)

__main__.SkipNode

In [147]:
node.key

'DB_44.txt'

In [206]:
testSkipList = SkipList()
for index, url in enumerate(url_list):
    key, elem = "{}".format(index), url
    # Note the argument order: the index string lands in insert's `elem` slot
    # and the URL in its `key` slot, so this list is ordered by the index text.
    testSkipList.insert(key, elem)
    print("{}-{}".format(key, elem))

0-https://www.youtube.com/watch?v=1R2vIsIPI38
1-https://www.youtube.com/watch?v=1R2vIsIPI38
2-https://www.youtube.com/watch?v=adex5YdEbOs
3-https://www.youtube.com/watch?v=vlCumuwLkEM
4-https://www.youtube.com/watch?v=d25EQaECNnU
5-https://www.youtube.com/watch?v=Qh8DoWbumgs&t=4s
6-https://www.youtube.com/watch?v=OE9mcx_iJrE


In [207]:
testSkipList.printList()

3-6-
1-3-6-
0-1-2-3-4-5-6-
