In [1]:
import pandas as pd
import boto3

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import udf
from tqdm.notebook import tqdm
from pyspark.sql.types import *
from pyspark.sql.functions import struct
from pyspark.sql.functions import countDistinct
import os
import json
import re
import string

In [2]:
# load bucket
AVSLS_BUCKET = 'miba-ma-prj-aviasales'

with open('access.json') as file:
    access_data = json.load(file)

In [3]:
# load session
session = boto3.session.Session()
s3 = session.client(
    service_name='s3',
    aws_access_key_id=access_data['aws_access_key_id'],
    aws_secret_access_key=access_data['aws_secret_access_key'],
    endpoint_url='https://hb.bizmrg.com'
)

In [4]:
conf = SparkConf()
conf.set('spark.master', 'local[*]')
conf.set('spark.executor.memory', '16G')
conf.set('spark.driver.memory', '16G')
conf.set('spark.driver.maxResultSize', '16G')
sc = SparkContext(conf=conf)
spark = SparkSession(sc)
spark

In [5]:
spark._jsc.hadoopConfiguration().set('fs.s3a.access.key', access_data['aws_access_key_id'])
spark._jsc.hadoopConfiguration().set('fs.s3a.secret.key', access_data['aws_secret_access_key'])
spark._jsc.hadoopConfiguration().set('fs.s3a.impl','org.apache.hadoop.fs.s3a.S3AFileSystem')
spark._jsc.hadoopConfiguration().set('fs.s3a.multipart.size', '104857600')
spark._jsc.hadoopConfiguration().set('fs.s3a.block.size', '33554432')
spark._jsc.hadoopConfiguration().set('fs.s3a.threads.max', '256')
spark._jsc.hadoopConfiguration().set('fs.s3a.endpoint', 'https://hb.bizmrg.com')

In [None]:
bucket = AVSLS_BUCKET
prefix = 'backlinks'
for obj in s3.list_objects_v2(Bucket=bucket, Prefix=prefix)['Contents']:
    print(obj['Key'] + ': ' + str(obj['Size']))

In [None]:
bucket = AVSLS_BUCKET
prefix = 'work/folder_3/'
for obj in s3.list_objects_v2(Bucket=bucket, Prefix=prefix)['Contents']:
    print(obj['Key'] + ': ' + str(obj['Size']))

In [None]:
bucket = AVSLS_BUCKET
prefix = 'backlinks'
summ = 0
for obj in s3.list_objects_v2(Bucket=bucket, Prefix=prefix)['Contents']:
    summ += obj['Size']
    
print(summ)

In [None]:
df = spark.read.csv(f's3a://miba-ma-prj-aviasales/backlinkslarge/jdoqocy.com', sep=',', header=True)#.select('refdomain', 'url_to', 'total_backlinks')

In [None]:
df.limit(10).toPandas()

In [None]:
bucket = AVSLS_BUCKET
prefix = 'backlinks/'
for obj in s3.list_objects_v2(Bucket=bucket, Prefix=prefix)['Contents']:
    print(obj['Key'] + ': ' + str(obj['Size']))

In [None]:
list1 = ['2lka.net.csv', 'affilired.com.csv', 'cityadspix.com.csv', 'evyy.net.csv', 'hskwq.com.csv', 'maxbounty.com.csv.csv', 'nfemo.com.csv', 'ojrq.net.csv', 'pwieu.com.csv', 'sjv.io.csv']

In [None]:
list2 = ['ab-in-den-urlaub.de.csv', 'aegeanair.com.csv', 'aida.de.csv', 'airbnb.ae.csv', 'airbnb.at.csv', 'airbnb.be.csv', 'airbnb.cat.csv', 'airbnb.ch.csv', 'airbnb.cl.csv', 'airbnb.cn.csv', 'airbnb.co.id.csv', 'airbnb.co.in.csv', 'airbnb.co.kr.csv', 'airbnb.co.nz.csv', 'airbnb.co.uk.csv', 'airbnb.co.za.csv', 'airbnb.com.ar.csv', 'airbnb.com.au.csv', 'airbnb.com.br.csv', 'airbnb.com.co.csv', 'airbnb.com.mt.csv', 'airbnb.com.ro.csv', 'airbnb.com.sg.csv', 'airbnb.com.tr.csv', 'airbnb.com.tw.csv', 'airbnb.cz.csv', 'airbnb.de.csv', 'airbnb.dk.csv', 'airbnb.es.csv', 'airbnb.fi.csv', 'airbnb.fr.csv', 'airbnb.gr.csv', 'airbnb.hu.csv', 'airbnb.ie.csv', 'airbnb.it.csv', 'airbnb.jp.csv', 'airbnb.mx.csv', 'airbnb.nl.csv', 'airbnb.no.csv', 'airbnb.pl.csv', 'airbnb.pt.csv', 'airbnb.ru.csv', 'airbnb.se.csv', 'avs.io.csv', 'biletyplus.ru.csv', 'billiger-mietwagen.de.csv', 'blablacar.com.ua.csv', 'blablacar.ru.csv', 'bringfido.com.csv', 'britishairways.com.csv', 'busbud.com.csv', 'busfor.ru.csv', 'busfor.ua.csv', 'cashfree.com.csv', 'centraldereservas.com.csv', 'civitatis.com.csv', 'decolar.com.csv', 'despegar.com.mx.csv', 'directferries.com.csv', 'enterprise.co.uk.csv', 'enterprise.com.csv', 'enuygun.com.csv', 'expedia.ca.csv', 'expedia.de.csv', 'expedia.fr.csv', 'expedia.it.csv', 'extendedstayamerica.com.csv', 'ferryhopper.com.csv', 'firstchoice.co.uk.csv', 'hotelplanner.com.csv', 'hotelscombined.co.kr.csv', 'hotelscombined.com.tw.csv', 'hotelscombined.hk.csv', 'hotwire.com.csv', 'kayak.co.uk.csv', 'kayak.com.br.csv', 'kayak.com.csv', 'kayak.es.csv', 'kayak.fr.csv', 'kayak.it.csv', 'kiwi.com.csv', 'kkday.com.csv', 'logitravel.com.csv', 'mgmresorts.com.csv', 'minube.com.csv', 'momondo.com.csv', 'nationalexpress.com.csv', 'nocowanie.pl.csv', 'novasol.de.csv', 'omio.com.csv', 'oyorooms.com.csv', 'ozon.travel.csv', 'priceline.com.csv', 'qantas.com.csv', 'qatarairways.com.csv', 'redbus.in.csv', 'regiojet.cz.csv', 'rentalcars.com.csv', 'reservations.com.csv', 'ritzcarlton.com.csv', 'rixos.com.csv', 'roadtrippers.com.csv', 'rome2rio.com.csv', 'sixt.com.csv', 'sixt.de.csv', 'skyscanner.com.csv', 'skyscanner.com.tr.csv', 'skyscanner.de.csv', 'skyscanner.es.csv', 'skyscanner.fr.csv', 'skyscanner.it.csv', 'skyscanner.net.csv', 'skyscanner.nl.csv', 'skyscanner.pl.csv', 'skyscanner.ru.csv', 'sykescottages.co.uk.csv', 'traghettilines.it.csv', 'travelata.ru.csv', 'travelminit.ro.csv', 'traveloka.com.csv', 'traveltriangle.com.csv', 'trenes.com.csv', 'trip.com.csv', 'tui.co.uk.csv', 'tui.pl.csv', 'tvil.ru.csv', 'wanderu.com.csv', 'wego.com.csv', 'withairbnb.com.csv', 'wotif.com.csv', 'yatra.com.csv']

In [None]:
list3 = ['7eer.net', 'Tradedoubler.com', 'admitad.com', 'anrdoezrs.net', 'avantlink.com', 'awin1.com', 'click.linksynergy.com', 'dpbolvw.net', 'go.skimresources.com', 'jdoqocy.com', 'kqzyfj.com', 'prf.hn', 'pxf.io', 'shareasale.com', 'tc.tradetracker.net', 'tkqlhce.com', 'viglink.com']

In [None]:
list4 = ['agoda.com', 'airbnb.com', 'aviasales.ru', 'booking.com', 'getyourguide.com', 'hilton.com', 'hotelscombined.com', 'viator.com']

In [None]:
count = 0
for i in list4:
    df = spark.read.csv(f's3a://miba-ma-prj-aviasales/backlinkslarge/{i}', sep=',', header=True)
    count += df.count()
    print(count)

print(f'Total: {count}')

In [None]:
df = spark.read.csv(f's3a://miba-ma-prj-aviasales/backlinks/cashfree.com.csv', sep=',', header=True)#.select('refdomain', 'url_to', 'total_backlinks')

In [None]:
df.limit(10).toPandas()

In [None]:
df.coalesce(1).write \
    .mode('overwrite') \
    .option("mapreduce.fileoutputcommitter.marksuccessfuljobs","false") \
    .option("header","true") \
    .csv(f'total_backlinks')

# Downloading brand-refdomain aggregates

In [None]:
df = spark.read.csv(f'shared/avs/dataset.csv', sep=',', header=True)
#df = spark.read.csv('s3a://miba-ma-prj-aviasales/work/folder_4/', sep=',', header=True)
df.limit(5).toPandas()

In [None]:
df.count()

In [None]:
df.select('brand').distinct().count()

In [None]:
brand_ref = df.select(['brand', 'refdomain']).distinct().groupBy('brand').count()

In [None]:
# adv_ref.toPandas().to_csv('refdomains.csv')

In [None]:
brand_ref.limit(10).toPandas()

In [None]:
brand_ref.select('brand').count()

In [None]:
brand_ref.coalesce(1).write \
    .mode('overwrite') \
    .option("mapreduce.fileoutputcommitter.marksuccessfuljobs","false") \
    .option("header","true") \
    .csv(f'to_transfer/brand_ref')

In [None]:
adv_ref = df.select(['advertiser', 'refdomain']).distinct().groupBy('advertiser').count()

In [None]:
adv_ref.limit(10).toPandas()

In [None]:
adv_ref.coalesce(1).write \
    .mode('overwrite') \
    .option("mapreduce.fileoutputcommitter.marksuccessfuljobs","false") \
    .option("header","true") \
    .csv(f'to_transfer/adv_ref')

# Downloading aggregates for finance

In [6]:
df = spark.read.csv(f'shared/avs/dataset.csv', sep=',', header=True)
df.limit(5).toPandas()

Unnamed: 0,advertiser,brand,refdomain,url_from,url_to,domain_rating,total_backlinks,refdomains,links_internal,links_external,linked_root_domains,last_visited,language,traffic,source
0,hotels.com,hotels,gocouponcodes.com,https://www.gocouponcodes.com/Hotels.com-a1702...,https://www.tkqlhce.com/click-3593945-10522625...,3,1,0,95,31,5,2020-10-30T06:35:29Z,English,0.0,cj.com
1,hotels.com,hotels,thehotelreservations.com,http://www.thehotelreservations.com/ski-vacati...,http://www.tkqlhce.com/click-1701608-10522625,2,1,0,63,8,5,2020-05-06T01:52:35Z,English,0.0,cj.com
2,hotels.com,hotels,toliveanddine.com,https://www.toliveanddine.com/2016/10/04/weekl...,http://www.jdoqocy.com/click-8070400-10522625-...,1,1,0,578,63,14,2020-10-26T01:21:37Z,English,0.0,cj.com
3,hotels.com,hotels,realfamilytrips.com,http://realfamilytrips.com/spotlight-winter-ho...,http://www.jdoqocy.com/click-7625323-10522625,6,1,0,52,37,15,2020-10-21T00:33:03Z,English,0.0,cj.com
4,hotels.com,hotels,vibafima.com,http://chi.vibafima.com/KUP/AFFIL_KUP_Hotelsco...,https://www.jdoqocy.com/click-1567021-10522625,13,1,1,2,91,6,2020-10-29T11:09:41Z,English,0.0,cj.com


In [12]:
df.count()

14554576

In [8]:
columns = ['refdomain', 'advertiser', 'total_backlinks', 'domain_rating', 'language']
# + unique backlinks

In [13]:
aggregate = df.select(columns).groupBy(['advertiser', 'refdomain', 'language']).sum('total_backlinks')

AnalysisException: "total_backlinks" is not a numeric column. Aggregation function can only be applied on a numeric column.;

In [14]:
aggregate = df.select(columns).groupBy(['advertiser', 'refdomain', 'language']).agg(sum("total_backlinks"))

TypeError: unsupported operand type(s) for +: 'int' and 'str'

In [None]:
aggregate = df.select(['advertiser', 'refdomain']).distinct().groupBy('advertiser').count()

## expedia 

In [None]:
columns = ['url_from','refdomain','ahrefs_rank','domain_rating','ahrefs_top','ip_from','links_internal','links_external','language','url_to','last_visited','refdomains','linked_root_domains','traffic','total_backlinks', 'advertiser']

In [None]:
df = spark.read.csv(f's3a://miba-ma-prj-aviasales/backlinks/expedia.fr.csv', sep=',', header=True)
marker = 'affcid='

def find_aff(url_to_column, marker=marker):
    for j in url_to_column:
        if j is not None and marker in j.lower():
            return 1
        else:
            return 0

def advertiser(url_to_column):
    for j in url_to_column:
        return 'expedia.fr'

find_aff_func = F.udf(find_aff, StringType())
advertiser_func = F.udf(advertiser, StringType())


sdf = df.withColumn('is_aff', find_aff_func(struct([df['url_to'] for x in df.columns])))
sdf = sdf.where(sdf.is_aff == 1)
sdf = sdf.withColumn('advertiser', advertiser_func(struct([sdf['url_to'] for x in sdf.columns])))
sdf = sdf.select(columns)

sdf.toPandas()

In [None]:
sdf.toPandas().to_csv(f'to_transfer/expedia.fr', index=False)

# Aviasales source

In [8]:
df = spark.read.csv(f's3a://miba-ma-prj-aviasales/work/folder_3/aviasales.ru.csv', sep=',', header=True).toPandas()
df

Unnamed: 0,url_from,refdomain,ahrefs_rank,domain_rating,ahrefs_top,ip_from,links_internal,links_external,language,url_to,last_visited,refdomains,linked_root_domains,traffic,total_backlinks,advertiser
0,https://goodwidgets.ru/air-tickets/route/from-...,goodwidgets.ru,0,0,0,83.243.73.209,9,30,,https://search.aviasales.ru/?adults=1&children...,2020-10-17T22:28:46Z,0,5,0.0,13,aviasales.ru
1,https://garlandus.ru/air-tickets/route/from-So...,garlandus.ru,0,0,0,83.243.73.209,11,23,ru,https://search.aviasales.ru/?adults=1&children...,2020-10-23T10:57:09Z,0,5,0.0,3,aviasales.ru
2,https://aviasales-ticket.blogspot.com/2014/12/...,aviasales-ticket.blogspot.com,0,0,0,216.58.204.97,183,73,ru,http://engine.aviasales.ru/latest_prices?origi...,2020-10-14T12:21:39Z,0,14,0.0,2,aviasales.ru
3,https://airetic.ru/dieshievyie-aviabiliety-v-d...,airetic.ru,0,0,172628942,104.18.46.120,148,25,ru,https://www.aviasales.ru/search/MOW1611JED1612...,2020-10-30T12:21:52Z,0,5,0.0,1,aviasales.ru
4,https://airetic.ru/dieshievyie-biliety-iz-ghor...,airetic.ru,0,0,172628942,104.18.46.120,155,227,ru,https://www.aviasales.ru/search/OMS0511EVN1211...,2020-10-28T05:30:02Z,0,5,0.0,1,aviasales.ru
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
678514,http://minskysoft.ru/,minskysoft.ru,53,37,1989146,62.122.170.171,186,29,ru,https://hotels.aviasales.ru/?marker=122233.430...,2020-10-22T15:38:23Z,828,12,0.0,1,aviasales.ru
678515,http://www.situation.ru/,situation.ru,53,31,3708372,62.122.170.171,186,38,ru,https://hotels.aviasales.ru/?marker=122233.435...,2020-11-02T04:01:41Z,901,15,0.0,1,aviasales.ru
678516,https://airlines.aero/,airlines.aero,54,37,1939024,31.31.199.97,39,8,ru,https://www.aviasales.ru/ios/?marker=248703,2020-10-18T19:36:32Z,887,7,1411.648527,2,aviasales.ru
678517,http://www.contr-tv.ru/,contr-tv.ru,56,28,4990376,62.122.170.171,186,38,ru,https://hotels.aviasales.ru/?marker=122233.200...,2020-11-02T00:10:00Z,1180,15,0.0,1,aviasales.ru


In [None]:
df['source']

# Marker search

In [None]:
def find_arguments(row):
    m = re.search("\/([^\/]+)\/?$", str(row))
    if m is not None:
        m = m.group()
        if '?' in m:
            m = re.search("\?(.*?$)", m).group(1)
            split =  m.split('&')
            for i in split:
                if '=' in i:
                    arg = re.search('(.*)=', i)
                    if arg != None:
                        f.append(arg.group(1))
                        
                             
                        
def find_marker_links(df, examples=3):

    markers = pd.DataFrame()

    for i in range(len(m)):
        data = []
        
        for j in range(len(df)):
            if df.url_to[j] is not None and (f'&{m.index[i]}' in df.url_to[j] or f'?{m.index[i]}' in df.url_to[j]):
                data.append(df.url_to[j])
            else:
                continue

        data = data[:examples]
        keys = [m.index[i]]*examples
        num = [m.iloc[i][0]]*examples

        markers = markers.append(pd.DataFrame({'key':keys, 'num':num, 'link': data})).reset_index(drop = True)
        
    return markers

In [None]:
lst = ['ab-in-den-urlaub.de.csv',
'aegeanair.com.csv',
'cashfree.com.csv',
'centraldereservas.com.csv',
'expedia.it.csv',
'extendedstayamerica.com.csv',
'hotwire.com.csv',
'nationalexpress.com.csv',
'nocowanie.pl.csv',
'omio.com.csv',
'oyorooms.com.csv',
'redbus.in.csv',
'regiojet.cz.csv',
'rentalcars.com.csv',
'reservations.com.csv',
'rome2rio.com.csv',
'sykescottages.co.uk.csv',
'traveltriangle.com.csv',
'trenes.com.csv',
'tui.co.uk.csv',
'tui.pl.csv',
'tvil.ru.csv'
]

In [None]:
lst2 = ['viator.com','hilton.com','getyourguide.com']

In [None]:
for dataset in lst:
    try:
        df = spark.read.csv(f's3a://miba-ma-prj-aviasales/backlinks/{dataset}', sep=",", header=True).limit(1000000).toPandas()

        f = []

        for i in df.url_to:
            find_arguments(i)

        args = pd.DataFrame(columns=['arg'], data=f)

        m = args['arg'].value_counts()
        m = pd.DataFrame(columns=['arg'], data = m)
        m = m.head(15)

        markers = find_marker_links(df, 3)
        markers.to_csv(f'markers/{dataset}', index=False)
    
    except:
        pass

In [None]:
for dataset in lst2:
    try:
        df = spark.read.csv(f's3a://miba-ma-prj-aviasales/backlinkslarge/{dataset}', sep=",", header=True).limit(1000000).toPandas()

        f = []

        for i in df.url_to:
            find_arguments(i)

        args = pd.DataFrame(columns=['arg'], data=f)

        m = args['arg'].value_counts()
        m = pd.DataFrame(columns=['arg'], data = m)
        m = m.head(15)

        markers = find_marker_links(df, 3)
        markers.to_csv(f'markers/{dataset}', index=False)
    
    except:
        pass

In [None]:
name = 'getyourguide.com'
df = spark.read.csv(f's3a://miba-ma-prj-aviasales/backlinkslarge/{name}', sep=",", header=True)

In [None]:
df = df.limit(500000).toPandas()

In [None]:
f = []

for i in df.url_to:
    find_arguments(i)

args = pd.DataFrame(columns=['arg'], data=f)

m = args['arg'].value_counts()
m = pd.DataFrame(columns=['arg'], data = m)
m = m.head(15)
m

In [None]:
markers = find_marker_links(df, 3)

In [None]:
markers

In [None]:
markers.to_csv(f'markers/getyourguide.com', index=False)

In [None]:
markers.link.iloc[5]

In [None]:
test = pd.DataFrame()
for i in range(len(df)):
    if df.url_to[i] is not None and 'cid=' in df.url_to[i]:
        test = test.append(df.iloc[i])

In [None]:
test[['refdomain', 'url_to']].sort_values('refdomain', ascending=False).iloc[50:100]

# Subdomain search

In [None]:
pip install tldextract

In [None]:
import tldextract

In [None]:
extract = tldextract.extract('https://www.getyourguide.com/camino-de-santiag')
domain = "{}.{}.{}".format(extract.subdomain, extract.domain, extract.suffix)
domain

In [None]:
df = spark.read.csv(f's3a://miba-ma-prj-aviasales/backlinks/{dataset}', sep=",", header=True).select('url_to').limit(1000000).toPandas().url_to
domains = []

for i in range(len(df)):
    if df.iloc[i] is not None:
        extract = tldextract.extract(df.iloc[i])
        domain = "{}.{}.{}".format(extract.subdomain, extract.domain, extract.suffix)
        domains.append(domain)
    else:
        continue

In [None]:
domains

In [None]:
pd.DataFrame(data=domains, columns=['domain']).groupby(['domain']).size().reset_index(name='counts').sort_values('counts', ascending=False)

In [None]:
for dataset in lst2:
    
    df = spark.read.csv(f's3a://miba-ma-prj-aviasales/backlinkslarge/{dataset}', sep=",", header=True).select('url_to').limit(1000000).toPandas().url_to

    domains = []

    for i in range(len(df)):
        if df.iloc[i] is not None:
            extract = tldextract.extract(df.iloc[i])
            domain = "{}.{}.{}".format(extract.subdomain, extract.domain, extract.suffix)
            domains.append(domain)
        else:
            continue
        
    domains = pd.DataFrame(data=domains, columns=['domain']).groupby(['domain']).size().reset_index(name='counts').sort_values('counts', ascending=False)

    domains.to_csv(f'markers/{dataset}', index=False)

# qeeq.com uk

In [None]:
df = spark.read.csv('s3a://miba-ma-prj-aviasales/work/folder_4/tradetracker.net.csv', sep=',', header=True)

In [None]:
df.count()

In [None]:
df.where(df.advertiser == 'qeeq.com uk').limit(5).toPandas()

In [None]:
def fix_adv(col):
    for i in col:
        if i == 'qeeq.com uk':
            return 'qeeq.com'
        else:
            return i
        
fix_adv_func = F.udf(fix_adv, StringType())

sdf = df.withColumn('advertiser', fix_adv_func(struct([df['advertiser'] for x in df.columns])))

sdf.limit(5).toPandas()

In [None]:
def fix_brand(col):
    for i in col:
        if i == 'com uk':
            return 'qeeq'
        else:
            return i
        
fix_brand_func = F.udf(fix_brand, StringType())

sdf = sdf.withColumn('brand', fix_brand_func(struct([sdf['brand'] for x in sdf.columns])))

sdf.limit(5).toPandas()

In [None]:
sdf.where(df.advertiser == 'qeeq.com uk').limit(5).toPandas()

In [None]:
sdf.count()

In [None]:
sdf.coalesce(1).write \
    .mode('overwrite') \
    .option("mapreduce.fileoutputcommitter.marksuccessfuljobs","false") \
    .option("header","true") \
    .csv(f'to_transfer/tradetracker.net')

In [None]:
path_file_upload = f'to_transfer/tradetracker.net.csv'
path_file_s3 = f'work/folder_4/tradetracker.net.csv'
s3.upload_file(path_file_upload, AVSLS_BUCKET, path_file_s3)

# Count datasets

In [9]:
list1 = ['2lka.net.csv','affilired.com.csv','cityadspix.com.csv','evyy.net.csv','hskwq.com.csv','nfemo.com.csv','ojrq.net.csv','pwieu.com.csv','sjv.io.csv']

In [15]:
total = 0
for i in list1:
    count = spark.read.csv(f's3a://miba-ma-prj-aviasales/backlinks/{i}', sep=',', header=True).count()
    print(f'{i} {count}')
    total += count
print('')
print(f'total {total}')

2lka.net.csv 362
affilired.com.csv 1217
cityadspix.com.csv 33585
evyy.net.csv 472034
hskwq.com.csv 18580
nfemo.com.csv 20957
ojrq.net.csv 57162
pwieu.com.csv 7084
sjv.io.csv 421725

total 1032706


In [19]:
list2 = ['7eer.net','Tradedoubler.com','admitad.com','anrdoezrs.net','avantlink.com','awin1.com','click.linksynergy.com','dpbolvw.net','go.skimresources.com','jdoqocy.com','kqzyfj.com','prf.hn','pxf.io','shareasale.com','tc.tradetracker.net','tkqlhce.com','viglink.com','travelpayouts.com.csv','tp.media.csv']

In [18]:
total = 0
for i in list2:
    count = spark.read.csv(f's3a://miba-ma-prj-aviasales/backlinkslarge/{i}', sep=',', header=True).count()
    print(f'{i} {count}')
    total += count
print('')
print(f'total {total}')

7eer.net 4756617
Tradedoubler.com 6082248
admitad.com 5277285
anrdoezrs.net 5824070
avantlink.com 1674449
awin1.com 6520070
click.linksynergy.com 8726239
dpbolvw.net 4905781
go.skimresources.com 1673027
jdoqocy.com 6035107
kqzyfj.com 5954637
prf.hn 4727094
pxf.io 1229563
shareasale.com 8049325
tc.tradetracker.net 3612063
tkqlhce.com 6385463
viglink.com 7318497
travelpayouts.com.csv 256738
tp.media.csv 98316

total 89106589


In [21]:
list3 = ['ab-in-den-urlaub.de.csv','aida.de.csv','airbnb.ae.csv','airbnb.at.csv','airbnb.be.csv','airbnb.cat.csv','airbnb.ch.csv','airbnb.cl.csv','airbnb.cn.csv','airbnb.co.id.csv','airbnb.co.in.csv','airbnb.co.kr.csv','airbnb.co.nz.csv','airbnb.co.uk.csv','airbnb.co.za.csv','airbnb.com.ar.csv','airbnb.com.au.csv','airbnb.com.br.csv','airbnb.com.co.csv','airbnb.com.mt.csv','airbnb.com.ro.csv','airbnb.com.sg.csv','airbnb.com.tr.csv','airbnb.com.tw.csv','airbnb.cz.csv','airbnb.de.csv','airbnb.dk.csv','airbnb.es.csv','airbnb.fi.csv','airbnb.fr.csv','airbnb.gr.csv','airbnb.hu.csv','airbnb.ie.csv','airbnb.it.csv','airbnb.jp.csv','airbnb.mx.csv','airbnb.nl.csv','airbnb.no.csv','airbnb.pl.csv','airbnb.pt.csv','airbnb.ru.csv','airbnb.se.csv','avs.io.csv','biletyplus.ru.csv','billiger-mietwagen.de.csv','blablacar.com.ua.csv','blablacar.ru.csv','bringfido.com.csv','britishairways.com.csv','busbud.com.csv','busfor.ru.csv','busfor.ua.csv','civitatis.com.csv','decolar.com.csv','despegar.com.mx.csv','directferries.com.csv','enterprise.co.uk.csv','enterprise.com.csv','enuygun.com.csv','expedia.ca.csv','expedia.de.csv','expedia.fr.csv','expedia.it.csv','ferryhopper.com.csv','firstchoice.co.uk.csv','hotelplanner.com.csv','hotelscombined.co.kr.csv','hotelscombined.com.tw.csv','hotelscombined.hk.csv','kayak.co.uk.csv','kayak.com.br.csv','kayak.com.csv','kayak.es.csv','kayak.fr.csv','kayak.it.csv','kiwi.com.csv','kkday.com.csv','logitravel.com.csv','mgmresorts.com.csv','minube.com.csv','momondo.com.csv','novasol.de.csv','omio.com.csv','oyorooms.com.csv','ozon.travel.csv','priceline.com.csv','qantas.com.csv','qatarairways.com.csv','regiojet.cz.csv','rentalcars.com.csv','ritzcarlton.com.csv','rixos.com.csv','sixt.com.csv','sixt.de.csv','skyscanner.com.csv','skyscanner.com.tr.csv','skyscanner.de.csv','skyscanner.es.csv','skyscanner.fr.csv','skyscanner.it.csv','skyscanner.net.csv','skyscanner.nl.csv','skyscanner.pl.csv','skyscanner.ru.csv','traghettilines.it.csv','travelata.ru.csv','travelminit.ro.csv','traveloka.com.csv','trip.com.csv','tui.co.uk.csv','tui.pl.csv','wanderu.com.csv','wego.com.csv','withairbnb.com.csv','wotif.com.csv','yatra.com.csv']

In [22]:
total = 0
for i in list3:
    count = spark.read.csv(f's3a://miba-ma-prj-aviasales/backlinks/{i}', sep=',', header=True).count()
    print(f'{i} {count}')
    total += count
print('')
print(f'total {total}')

ab-in-den-urlaub.de.csv 29839
aida.de.csv 35897
airbnb.ae.csv 938
airbnb.at.csv 3197
airbnb.be.csv 5792
airbnb.cat.csv 869
airbnb.ch.csv 5448
airbnb.cl.csv 1539
airbnb.cn.csv 7655
airbnb.co.id.csv 1693
airbnb.co.in.csv 13667
airbnb.co.kr.csv 3409
airbnb.co.nz.csv 7618
airbnb.co.uk.csv 80410
airbnb.co.za.csv 1298
airbnb.com.ar.csv 2983
airbnb.com.au.csv 42510
airbnb.com.br.csv 22191
airbnb.com.co.csv 1671
airbnb.com.mt.csv 1585
airbnb.com.ro.csv 1519
airbnb.com.sg.csv 9431
airbnb.com.tr.csv 4273
airbnb.com.tw.csv 10360
airbnb.cz.csv 7354
airbnb.de.csv 35959
airbnb.dk.csv 5041
airbnb.es.csv 34734
airbnb.fi.csv 2874
airbnb.fr.csv 58788
airbnb.gr.csv 4604
airbnb.hu.csv 2701
airbnb.ie.csv 12606
airbnb.it.csv 27663
airbnb.jp.csv 26858
airbnb.mx.csv 6363
airbnb.nl.csv 20125
airbnb.no.csv 4368
airbnb.pl.csv 9614
airbnb.pt.csv 6500
airbnb.ru.csv 22585
airbnb.se.csv 6361
avs.io.csv 9538
biletyplus.ru.csv 21608
billiger-mietwagen.de.csv 11216
blablacar.com.ua.csv 1262
blablacar.ru.csv 3549
bringf

In [23]:
list4 = ['agoda.com','airbnb.com','aviasales.ru','booking.com','getyourguide.com','hotelscombined.com','viator.com']

In [24]:
total = 0
for i in list4:
    count = spark.read.csv(f's3a://miba-ma-prj-aviasales/backlinkslarge/{i}', sep=',', header=True).count()
    print(f'{i} {count}')
    total += count
print('')
print(f'total {total}')

agoda.com 6236629
airbnb.com 4125356
aviasales.ru 807512
booking.com 6916214
getyourguide.com 1584104
hotelscombined.com 1388280
viator.com 1270285

total 22328380


In [25]:
list5 = ['awin1.com.csv','click.linksynergy.com.csv','shareasale.com.csv','tc.tradetracker.net.csv','admitad.com.csv','tkqlhce.com.csv','jdoqocy.com.csv','anrdoezrs.net.csv','dpbolvw.net.csv','kqzyfj.com.csv','tradedoubler.com.csv','travelpayouts.com.csv','tp.media.csv']

In [26]:
total = 0
for i in list5:
    count = spark.read.csv(f's3a://miba-ma-prj-aviasales/work/folder_1/{i}', sep=',', header=True).count()
    print(f'{i} {count}')
    total += count
print('')
print(f'total {total}')

awin1.com.csv 6520070
click.linksynergy.com.csv 6610605
shareasale.com.csv 184505
tc.tradetracker.net.csv 3582139
admitad.com.csv 5277285
tkqlhce.com.csv 6385463
jdoqocy.com.csv 6035107
anrdoezrs.net.csv 5824070
dpbolvw.net.csv 4905781
kqzyfj.com.csv 5954637
tradedoubler.com.csv 6082248
travelpayouts.com.csv 236531
tp.media.csv 98252

total 57696693


In [27]:
list6 = ['awin1.com.csv','cj.com.csv','tradetracker.net.csv','click.linksynergy.com.csv','shareasale.com.csv','tradedoubler.com.csv','travelpayouts.com.csv','tradedoubler.com(mixed travel).csv']

In [28]:
total = 0
for i in list6:
    count = spark.read.csv(f's3a://miba-ma-prj-aviasales/work/folder_2/{i}', sep=',', header=True).count()
    print(f'{i} {count}')
    total += count
print('')
print(f'total {total}')

awin1.com.csv 337256
cj.com.csv 673963
tradetracker.net.csv 584642
click.linksynergy.com.csv 9371
shareasale.com.csv 182063
tradedoubler.com.csv 36888
travelpayouts.com.csv 467122
tradedoubler.com(mixed travel).csv 2968721

total 5260026


In [30]:
list7 = ['prf.hn.csv','viglink.com.csv','go.skimresources.com.csv','affilired.com.csv','impact.com.csv','tc.tradetracker.net(mixed travel).csv','click.linksynergy.com(mixed travel).csv','shareasale(mixed travel).csv']

In [31]:
total = 0
for i in list7:
    count = spark.read.csv(f's3a://miba-ma-prj-aviasales/work/folder_2/{i}', sep=',', header=True).count()
    print(f'{i} {count}')
    total += count
print('')
print(f'total {total}')

prf.hn.csv 4142492
viglink.com.csv 7167527
go.skimresources.com.csv 1669587
affilired.com.csv 400
impact.com.csv 6932159
tc.tradetracker.net(mixed travel).csv 2866523
click.linksynergy.com(mixed travel).csv 2085611
shareasale(mixed travel).csv 1523157

total 26387456


In [32]:
list8 = ['avantlink.com.csv','cityads.com.csv','admitad.com.csv']

In [33]:
total = 0
for i in list8:
    count = spark.read.csv(f's3a://miba-ma-prj-aviasales/work/folder_2/{i}', sep=',', header=True).count()
    print(f'{i} {count}')
    total += count
print('')
print(f'total {total}')

avantlink.com.csv 611896
cityads.com.csv 10061
admitad.com.csv 5775727

total 6397684


In [40]:
list9 = ['agoda.com.csv','airbnb.com.csv','aviasales.ru.csv','booking.com.csv','getyourguide.com.csv','hotelscombined.com.csv','viator.com.csv','ab-in-den-urlaub.de.csv','aida.de.csv','airbnb.ae.csv','airbnb.at.csv','airbnb.be.csv','airbnb.cat.csv','airbnb.ch.csv','airbnb.cl.csv','airbnb.cn.csv','airbnb.co.id.csv','airbnb.co.in.csv','airbnb.co.kr.csv','airbnb.co.nz.csv','airbnb.co.uk.csv','airbnb.co.za.csv','airbnb.com.ar.csv','airbnb.com.au.csv','airbnb.com.br.csv','airbnb.com.co.csv','airbnb.com.mt.csv','airbnb.com.ro.csv','airbnb.com.sg.csv','airbnb.com.tr.csv','airbnb.com.tw.csv','airbnb.cz.csv','airbnb.de.csv','airbnb.dk.csv','airbnb.es.csv','airbnb.fi.csv','airbnb.fr.csv','airbnb.gr.csv','airbnb.hu.csv','airbnb.ie.csv','airbnb.it.csv','airbnb.jp.csv','airbnb.mx.csv','airbnb.nl.csv','airbnb.no.csv','airbnb.pl.csv','airbnb.pt.csv','airbnb.ru.csv','airbnb.se.csv','avs.io.csv','biletyplus.ru.csv','billiger-mietwagen.de.csv','blablacar.com.ua.csv','blablacar.ru.csv','bringfido.com.csv','britishairways.com.csv','busbud.com.csv','busfor.ru.csv','busfor.ua.csv','civitatis.com.csv','decolar.com.csv','despegar.com.mx.csv','directferries.com.csv','enterprise.co.uk.csv','enterprise.com.csv','enuygun.com.csv','expedia.ca.csv','expedia.de.csv','expedia.fr.csv','expedia.it.csv','ferryhopper.com.csv','firstchoice.co.uk.csv','hotelplanner.com.csv','hotelscombined.co.kr.csv','hotelscombined.com.tw.csv','hotelscombined.hk.csv','kayak.co.uk.csv','kayak.com.br.csv','kayak.com.csv','kayak.es.csv','kayak.fr.csv','kayak.it.csv','kiwi.com.csv','kkday.com.csv','logitravel.com.csv','mgmresorts.com.csv','minube.com.csv','momondo.com.csv','novasol.de.csv','omio.com.csv','oyorooms.com.csv','ozon.travel.csv','priceline.com.csv','qantas.com.csv','qatarairways.com.csv','regiojet.cz.csv','rentalcars.com.csv','ritzcarlton.com.csv','rixos.com.csv','sixt.com.csv','sixt.de.csv','skyscanner.com.csv','skyscanner.com.tr.csv','skyscanner.de.csv','skyscanner.es.csv','skyscanner.fr.csv','skyscanner.it.csv','skyscanner.net.csv','skyscanner.nl.csv','skyscanner.pl.csv','skyscanner.ru.csv','traghettilines.it.csv','travelata.ru.csv','travelminit.ro.csv','traveloka.com.csv','trip.com.csv','tui.co.uk.csv','tui.pl.csv','wanderu.com.csv','wego.com.csv','withairbnb.com.csv','wotif.com.csv','yatra.com.csv']

In [41]:
total = 0
for i in list9:
    count = spark.read.csv(f's3a://miba-ma-prj-aviasales/work/folder_3/{i}', sep=',', header=True).count()
    print(f'{i} {count}')
    total += count
print('')
print(f'total {total}')

agoda.com.csv 2879356
airbnb.com.csv 32910
aviasales.ru.csv 678519
booking.com.csv 5207700
getyourguide.com.csv 351515
hotelscombined.com.csv 920809
viator.com.csv 532680
ab-in-den-urlaub.de.csv 12
aida.de.csv 250
airbnb.ae.csv 2
airbnb.at.csv 49
airbnb.be.csv 220
airbnb.cat.csv 1
airbnb.ch.csv 111
airbnb.cl.csv 20
airbnb.cn.csv 49
airbnb.co.id.csv 13
airbnb.co.in.csv 287
airbnb.co.kr.csv 33
airbnb.co.nz.csv 216
airbnb.co.uk.csv 3620
airbnb.co.za.csv 20
airbnb.com.ar.csv 53
airbnb.com.au.csv 1815
airbnb.com.br.csv 1872
airbnb.com.co.csv 25
airbnb.com.mt.csv 6
airbnb.com.ro.csv 4
airbnb.com.sg.csv 50
airbnb.com.tr.csv 399
airbnb.com.tw.csv 222
airbnb.cz.csv 29
airbnb.de.csv 674
airbnb.dk.csv 40
airbnb.es.csv 696
airbnb.fi.csv 24
airbnb.fr.csv 2566
airbnb.gr.csv 58
airbnb.hu.csv 58
airbnb.ie.csv 694
airbnb.it.csv 551
airbnb.jp.csv 612
airbnb.mx.csv 138
airbnb.nl.csv 444
airbnb.no.csv 34
airbnb.pl.csv 114
airbnb.pt.csv 70
airbnb.ru.csv 411
airbnb.se.csv 31
avs.io.csv 0
biletyplus.ru.csv 1

In [42]:
list10 = ['direct_advertisers.csv','awin1.com.csv','cj.com.csv','travelpayouts.com.csv','tradedoubler.com.csv','tradetracker.net.csv','click.linksynergy.com.csv','shareasale.com.csv','prf.hn.csv','viglink.com.csv','go.skimresources.com.csv','affilired.com.csv','impact.com.csv','avantlink.com.csv','cityads.com.csv','admitad.com.csv']

In [43]:
total = 0
for i in list10:
    count = spark.read.csv(f's3a://miba-ma-prj-aviasales/work/folder_4/{i}', sep=',', header=True).count()
    print(f'{i} {count}')
    total += count
print('')
print(f'total {total}')

direct_advertisers.csv 11210328
awin1.com.csv 337256
cj.com.csv 673963
travelpayouts.com.csv 467122
tradedoubler.com.csv 452400
tradetracker.net.csv 619041
click.linksynergy.com.csv 34725
shareasale.com.csv 182915
prf.hn.csv 192208
viglink.com.csv 58420
go.skimresources.com.csv 43380
affilired.com.csv 304
impact.com.csv 257386
avantlink.com.csv 2643
cityads.com.csv 180
admitad.com.csv 82998

total 14615269
