In [34]:
import time

In [123]:
from random import randint, seed

class SkipNode:
    """A node from a skip list"""    
    def __init__(self, height = 0, elem = None , key=None):
        self.key = key
        self.elem = elem
        self.next = [None]*height

class SkipList:

    def __init__(self):
        self.head = SkipNode()
        self.len = 0
        self.maxHeight = 0

    def __len__(self):
        return self.len

    def find(self, elem, update = None):
        if update == None:
            update = self.updateList(elem)
        if len(update) > 0:
            candidate = update[0].next[0]
            if candidate != None and candidate.elem == elem:
                return candidate
        return None
    
    def contains(self, elem, update = None):
        return self.find(elem, update) != None
    
    def randomHeight(self):
        height = 1
        while randint(1, 2)!= 1:
            height += 1
        return height

    def updateList(self, elem):
        update = [None]*self.maxHeight
        x = self.head
        for i in reversed(range(self.maxHeight)):
            while x.next[i] != None and x.next[i].elem < elem:
                x = x.next[i]
            update[i] = x
        return update
        
    def insert(self, elem, key):

        node = SkipNode(self.randomHeight(), elem, key)

        self.maxHeight = max(self.maxHeight, len(node.next))
        while len(self.head.next) < len(node.next):
            self.head.next.append(None)

        update = self.updateList(elem)            
        if self.find(elem, update) == None:
            for i in range(len(node.next)):
                node.next[i] = update[i].next[i]
                update[i].next[i] = node
            self.len += 1

    def remove(self, elem):

        update = self.updateList(elem)
        x = self.find(elem, update)
        if x != None:
            for i in reversed(range(len(x.next))):
                update[i].next[i] = x.next[i]
                if self.head.next[i] == None:
                    self.maxHeight -= 1
            self.len -= 1            
                
    def printList(self):
        for i in range(len(self.head.next)-1, -1, -1):
            x = self.head
            while x.next[i] != None:
                print (x.next[i].elem + '-',end="")
                x = x.next[i]
            print ('')
    
    def get_ALL(self):
        data = []
        data2 = []
        x = self.head
        while x.next[0] != None:
            data.append(x.next[0].elem)
            data2.append(x.next[0].key)
            x = x.next[0]
        return data,data2

    def find_full_one(self, elem, update = None):
        if update == None:
            pdate = self.updateList(elem)
        if len(update) > 0:
            candidate = update[0].next[0]
            if candidate != None and (candidate.elem.find(elem) == -1):
                return candidate 
        return None
    
    def updateList(self, elem):
        update = [None]*self.maxHeight
        x = self.head
        for i in reversed(range(self.maxHeight)):
            while x.next[i] != None and x.next[i].elem < elem:
                x = x.next[i]
            update[i] = x
        return update
    



In [124]:
url_list = ["https://www.youtube.com/watch?v=1R2vIsIPI38","https://www.youtube.com/watch?v=1R2vIsIPI38","https://www.youtube.com/watch?v=adex5YdEbOs","https://www.youtube.com/watch?v=vlCumuwLkEM","https://www.youtube.com/watch?v=d25EQaECNnU","https://www.youtube.com/watch?v=Qh8DoWbumgs&t=4s","https://www.youtube.com/watch?v=OE9mcx_iJrE"]

In [125]:
sSkipList = SkipList()

In [126]:
for index,url in enumerate (url_list):
    key = "{}".format(index)
    elem = url
    sSkipList.insert(elem,key,)
    print("{}-{}".format(key,elem))

0-https://www.youtube.com/watch?v=1R2vIsIPI38
1-https://www.youtube.com/watch?v=1R2vIsIPI38
2-https://www.youtube.com/watch?v=adex5YdEbOs
3-https://www.youtube.com/watch?v=vlCumuwLkEM
4-https://www.youtube.com/watch?v=d25EQaECNnU
5-https://www.youtube.com/watch?v=Qh8DoWbumgs&t=4s
6-https://www.youtube.com/watch?v=OE9mcx_iJrE


In [127]:
sSkipList.printList()

https://www.youtube.com/watch?v=1R2vIsIPI38-https://www.youtube.com/watch?v=OE9mcx_iJrE-https://www.youtube.com/watch?v=Qh8DoWbumgs&t=4s-
https://www.youtube.com/watch?v=1R2vIsIPI38-https://www.youtube.com/watch?v=OE9mcx_iJrE-https://www.youtube.com/watch?v=Qh8DoWbumgs&t=4s-https://www.youtube.com/watch?v=adex5YdEbOs-https://www.youtube.com/watch?v=d25EQaECNnU-https://www.youtube.com/watch?v=vlCumuwLkEM-


In [128]:
sSkipList.get_ALL()

(['https://www.youtube.com/watch?v=1R2vIsIPI38',
  'https://www.youtube.com/watch?v=OE9mcx_iJrE',
  'https://www.youtube.com/watch?v=Qh8DoWbumgs&t=4s',
  'https://www.youtube.com/watch?v=adex5YdEbOs',
  'https://www.youtube.com/watch?v=d25EQaECNnU',
  'https://www.youtube.com/watch?v=vlCumuwLkEM'],
 ['0', '6', '5', '2', '4', '3'])

In [42]:
node = sSkipList.find('https://www.youtube.com/watch?v=1R2vIsIPI38')

In [43]:
type(node)

__main__.SkipNode

In [44]:
print(node.key)
print(node.elem)

0
https://www.youtube.com/watch?v=1R2vIsIPI38


In [45]:
print(type(node.elem))

<class 'str'>


In [46]:
node_1 = sSkipList.find_full_one("https://www.youtube.com/watch?v=1R2vIsIPI38")

TypeError: object of type 'NoneType' has no len()

In [47]:
"1230".find("4")

-1

In [100]:
type(node_1)

NoneType

In [88]:
print(node_1.key)
print(node_1.elem)

AttributeError: 'NoneType' object has no attribute 'key'

In [17]:
def get_dd():
    dd = {"@url:" : 0, "@title:" : 0, "@content:" : 0, "@viewCount:":0, "@res:":0, "@duration:":0 }
    return dd

In [18]:
def get_ddc():
    dd = {"@url:" : "", "@title:" : "", "@content:" : "", "@viewCount:":"", "@res:":"", "@duration:":"" ,"@":""}
    return dd

In [19]:
def get_bool(dd):
    for i in dd:
        if (dd[i] == 0):
            return False
    return True

In [43]:
data = []

In [44]:
line_text = ""
ticks = time.time()
print (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
with open('youtube_partaa',encoding='utf-8') as fin:
    flag = 0
    count = 0
    ddc = get_ddc()
    dd = get_dd()
    for line in fin:
        count+=1
#         if count == 100:
#             break
        
        if (get_bool(dd)):
#             print(count)
            data.append(ddc)
            ddc = get_ddc()
            dd = get_dd()
            continue
        else:
            if(line.find("@url:")!=-1):
#                 print("@url:")
                ddc["@url:"] = line
                dd["@url:"] = 1
                continue
            if(line.find("@title:")!=-1):
#                 print("@title:")
                ddc["@title:"] = line
                dd["@title:"] = 1
                continue
            if(line.find("@content:")!=-1):
#                 print("@content:")
                ddc["@content:"] = line
                dd["@content:"] = 1
                continue
            if(line.find("@viewCount:")!=-1):
#                 print("@viewCount:")
                ddc["@viewCount:"] = line
                dd["@viewCount:"] = 1
                continue
            if(line.find("@res:")!=-1):
#                 print("@res:")
                ddc["@res:"] = line
                dd["@res:"] = 1
                continue
            if(line.find("@duration:")!=-1):
#                 print("@duration:")
                ddc["@duration:"] = line
                dd["@duration:"] = 1
                flag = 1
                continue
ticks = time.time()
print (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))            

2019-05-06 02:06:08
2019-05-06 02:06:18


In [45]:
def write_file(file_name,data):
    with open(file_name,'a',encoding='utf-8') as f:
        for line in data:
            f.write(data[line])
        f.write("@\n")

In [48]:
print(count)
print(len(data))
# file_name = "DB_{}.txt".format(i)
# print(file_name)

6439529
885980


In [47]:
name_number = 0
file_name = "DB_{}.txt".format(name_number)
sSkipList = SkipList()
ticks = time.time()
print (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
for i in range(len(data)):
    if i%10000 == 0:
        name_number +=1
        file_name = "DB_{}.txt".format(name_number)
        print(file_name)
        ticks = time.time()
        print (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) 
    write_file(file_name,data[i])
    sSkipList.insert(data[i]["@url:"],file_name)
ticks = time.time()
print (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

2019-05-06 02:07:29
DB_1.txt
2019-05-06 02:07:29
DB_2.txt
2019-05-06 02:08:55
DB_3.txt
2019-05-06 02:10:23
DB_4.txt
2019-05-06 02:11:51
DB_5.txt
2019-05-06 02:12:19
DB_6.txt
2019-05-06 02:13:44
DB_7.txt
2019-05-06 02:15:12
DB_8.txt
2019-05-06 02:16:55
DB_9.txt
2019-05-06 02:17:23
DB_10.txt
2019-05-06 02:18:49
DB_11.txt
2019-05-06 02:20:16
DB_12.txt
2019-05-06 02:21:56
DB_13.txt
2019-05-06 02:22:24
DB_14.txt
2019-05-06 02:24:06
DB_15.txt
2019-05-06 02:25:47
DB_16.txt
2019-05-06 02:27:12
DB_17.txt
2019-05-06 02:28:39
DB_18.txt
2019-05-06 02:30:07
DB_19.txt
2019-05-06 02:30:35
DB_20.txt
2019-05-06 02:32:03
DB_21.txt
2019-05-06 02:33:30
DB_22.txt
2019-05-06 02:35:43
DB_23.txt
2019-05-06 02:37:09
DB_24.txt
2019-05-06 02:39:49
DB_25.txt
2019-05-06 02:41:18
DB_26.txt
2019-05-06 02:41:47
DB_27.txt
2019-05-06 02:44:05
DB_28.txt
2019-05-06 02:44:33
DB_29.txt
2019-05-06 02:46:00
DB_30.txt
2019-05-06 02:46:27
DB_31.txt
2019-05-06 02:48:00
DB_32.txt
2019-05-06 02:49:43
DB_33.txt
2019-05-06 02:51:11

In [28]:
node = sSkipList.find('@url:https://www.youtube.com/watch?v=aSoi2XVTPrU\n')

In [29]:
type(node)

__main__.SkipNode

In [30]:
node.key

'DB_177.txt'

In [206]:
testSkipList = SkipList()
for index,url in enumerate (url_list):
    key = "{}".format(index)
    elem = url
    testSkipList.insert(key,elem)
    print("{}-{}".format(key,elem))

0-https://www.youtube.com/watch?v=1R2vIsIPI38
1-https://www.youtube.com/watch?v=1R2vIsIPI38
2-https://www.youtube.com/watch?v=adex5YdEbOs
3-https://www.youtube.com/watch?v=vlCumuwLkEM
4-https://www.youtube.com/watch?v=d25EQaECNnU
5-https://www.youtube.com/watch?v=Qh8DoWbumgs&t=4s
6-https://www.youtube.com/watch?v=OE9mcx_iJrE


In [207]:
testSkipList.printList()

3-6-
1-3-6-
0-1-2-3-4-5-6-


In [31]:
import time

In [36]:
ticks = time.time()
print (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

2019-05-06 01:56:25


In [50]:
cnt = 0
for line in sSkipList:
    if cnt == 20:
        break
    print(line)
    cnt+=1

TypeError: 'SkipList' object is not iterable