In [1]:
import pandas as pd
import numpy as np

# Instagram user profile details
user_profile = pd.read_csv('https://se-data-test.s3.ca-central-1.amazonaws.com/user_details_v1.csv')

In [2]:
# grab some of the relevant columns
user_profile_clean = user_profile[['input.user_id', # match in comment dataset
                                   'user.username', # ig handle
                                   'user.biography', # services offered
                                   'user.external_url', # website
                                   'user.address_street', # city 
                                   'user.full_name', # pro name
                                   'user.category', # business type
                                   'user.city_name', # city
                                   'user.contact_phone_number', # phone number
                                   'user.public_email', # email
                                   'user.public_phone_country_code', # country
                                   'user.public_phone_number', # phone
                                   'user.zip', # zip postal
                                   'user.is_business', # business pro
                                   'user.is_potential_business', # business
                                   'user.account_type', # business pro?
                                   'user.latitude',
                                   'user.longitude'
                                   ]].copy()

In [3]:
# groupby username and collapse
user_profile_clean = user_profile_clean.groupby('user.username', as_index=False).agg(lambda x: ', '.join(set(x.astype(str))))

In [4]:
import re

# column with possible phone numbers extracted from the bio field
user_profile_clean['Phone Number 3'] = user_profile_clean['user.biography'].str.extract(
    "(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})")

# column with possible emails extracted from the bio field
user_profile_clean['Email 2'] = user_profile_clean['user.biography'].str.findall(
    "[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+")

# website
user_profile_clean['Website 1'] = user_profile_clean['user.biography'].str.extract('(https?://\S+)')

In [None]:
# given keyword extraction
business_keywords = ["hair salon", "nail salon", "spa", "barbershop"]
pro_keywords = ["hair stylist", "nail technician", "esthetician", "Barber"]
services_keywords = ["women's haircut", "men's haircut", "hair color", "hair extensions", "balayage", "highlights"]


user_profile_clean['Business Type'] = user_profile_clean.apply(lambda x: ','.join(x.dropna().astype(str)), axis=1) \
                                                          .str.findall("(?i)(?=("+'|'.join(business_keywords)+"))") \
                                                          .fillna('')

user_profile_clean['Pro Type'] = user_profile_clean.apply(lambda x: ','.join(x.dropna().astype(str)), axis=1) \
                                                          .str.findall("(?i)(?=("+'|'.join(pro_keywords)+"))") \
                                                          .fillna('')

user_profile_clean['Services'] = user_profile_clean.apply(lambda x: ','.join(x.dropna().astype(str)), axis=1) \
                                                          .str.findall("(?i)(?=("+'|'.join(services_keywords)+"))") \
                                                          .fillna('')

In [None]:
# now lets look at the user comments

# Instagram user posts data: 
user_posts = pd.read_csv('https://se-data-test.s3.ca-central-1.amazonaws.com/user_posts_v1.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# grab some of the relevant columns
user_posts_clean = user_posts[['input.session_id',
                               'input.username',
                               'items.location.name',
                               'items.location.address',
                               'items.location.city',
                               'items.location.short_name',
                               'items.user.username',
                               'items.user.full_name',
                               'items.usertags.in[0].user.username',
                               'items.usertags.in[0].user.full_name',
                               'items.caption.text',
                               'items.caption.user.full_name',
                               'items.caption.user.username',
                               'items.preview_comments[0].text',
                               'items.preview_comments[0].user.username',
                               'items.preview_comments[0].user.full_name',
                               'items.comments[0].text',
                               'items.comments[0].user.username',
                               'items.comments[0].user.full_name',
                               'items.likers[0].username',
                               'items.likers[0].full_name',
                               'items.creative_config.effect_preview.name',
                               'items.creative_config.effect_preview.attribution_user.username',
                               'items.creative_config.effect_preview.gatekeeper',
                               'items.igtv_series_info.title',
                               'items.sponsor_tags[0].username',
                               'items.sponsor_tags[0].full_name'
                               ]].copy()

In [None]:
# groupby username and collapse
user_posts_clean = user_posts_clean.groupby('input.username', as_index=False).agg(lambda x: ', '.join(set(x.astype(str))))

In [None]:
# column with possible phone numbers extracted from the captions
user_posts_clean['Phones'] = user_posts_clean['items.caption.text'].str.extract("(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})")

user_posts_clean['Emails'] = user_posts_clean['items.caption.text'].str.extract("([a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+)")


In [None]:
# search all the fields for keywords
user_posts_clean['Business Type'] = user_posts_clean.apply(lambda x: ','.join(x.dropna().astype(str)), axis=1) \
                                                          .str.findall("(?i)(?=("+'|'.join(business_keywords)+"))") \
                                                          .fillna('')

user_posts_clean['Pro Type'] = user_posts_clean.apply(lambda x: ','.join(x.dropna().astype(str)), axis=1) \
                                                          .str.findall("(?i)(?=("+'|'.join(pro_keywords)+"))") \
                                                          .fillna('')

user_posts_clean['Services'] = user_posts_clean.apply(lambda x: ','.join(x.dropna().astype(str)), axis=1) \
                                                          .str.findall("(?i)(?=("+'|'.join(services_keywords)+"))") \
                                                          .fillna('')

In [None]:
# outer merge the two datasets
users_merged = pd.merge(user_posts_clean, user_profile_clean, how='outer', left_on=['input.username'], right_on=['user.username'])

In [None]:
# adding and renaming columns

users_merged["Instagram Handle"] = users_merged["input.username"].fillna(users_merged["user.username"])
users_merged["Business Type"] = users_merged["Business Type_x"].fillna(users_merged["Business Type_y"])
users_merged["Pro Type"] = users_merged["Pro Type_x"].fillna(users_merged["Pro Type_y"])
users_merged["Business Name"] = users_merged["user.full_name"].fillna(users_merged["items.user.full_name"])
users_merged["Pro Name"] = users_merged["Business Name"]
users_merged["Services"] = users_merged["Services_x"].fillna(users_merged["Services_y"])

users_merged.rename({'user.is_business' : 'Business (T/F)',
                     'user.zip' : 'Zip / Postal',
                     'user.city_name' : 'City 1',
                     'user.public_email': 'Email 1',
                     'user.public_phone_number': 'Phone Number 1',
                     'user.contact_phone_number': 'Phone Number 2',
                     'user.external_url' : 'Website 3',
                     'user.address_street' : 'Street Address',
                     'user.latitude' : 'Latitude',
                     'user.longitude' : 'Longitude'
                    },
                  axis='columns', inplace=True)

In [None]:
# make a final copy

final = users_merged[['Instagram Handle',
                      'Business Type',
                      'Pro Type',
                      'Business Name',
                      'Pro Name',
                      'City 1',
                      'Services',
                      'Business (T/F)',
                      'Zip / Postal',
                      'Email 1',
                      'Email 2',
                      'Phone Number 1',
                      'Phone Number 2',
                      'Phone Number 3',
                      'Phones',
                      'Emails',
                      'Website 1',
                      'Website 3',
                      'Street Address',
                      'Latitude',
                      'Longitude']].copy()

# collect the phones
#final['Phone Number'] = final[["Phone Number 1", "Phone Number 3", "Phone Number 3"]].apply(lambda x: ','.join(x.fillna('').map(str)), axis=1)
final['Phone Number'] = final["Phone Number 1"].fillna(final["Phone Number 2"]).fillna(final["Phone Number 3"]).fillna(final["Phones"])

final['Website'] = final["Website 1"].fillna(final["Website 3"])

# collect emails
final['Email'] = final["Email 1"].fillna(final["Email 2"]).fillna(final["Emails"])
#final['Email'] = final[["Email 1", "Email 2"]].apply(lambda x: ','.join(x.fillna('').map(str)), axis=1)

# drop redundant
final.drop(['Phone Number 1', 'Phone Number 2', 'Phone Number 3', 'Phones', 
            'Email 1', 'Email 2', 'Emails', 'Website 1', 'Website 3'
            ], axis=1, inplace=True)


In [None]:
import numpy as np
final = final.replace('nan',np.NaN)
final.fillna('', inplace=True)

# remove duplicates
final['Services'] = final['Services'].apply(lambda x: list(set(x)))
final['Services'] = [','.join(map(str, l)) for l in final['Services']]

final['Business Type'] = final['Business Type'].apply(lambda x: list(set(x)))
final['Business Type'] = [','.join(map(str, l)) for l in final['Business Type']]

final['Pro Type'] = final['Pro Type'].apply(lambda x: list(set(x)))
final['Pro Type'] = [','.join(map(str, l)) for l in final['Pro Type']]

In [None]:
pip install reverse_geocoder



In [None]:
import reverse_geocoder as rg

coordinates = tuple(zip(final['Latitude'], final['Longitude']))

coord2 = []
for i in coordinates:
    if i[0] == '':
      coord2.append((0,0))
    elif i[0] != '':
      coord2.append(i)

coord2 = tuple(tuple(map(float, tup)) for tup in coord2) 
results = rg.search(coord2) # default mode = 2

In [None]:
locations = []
for d in results:
    if d['name'] != 'Takoradi':
        locations.append((d['cc'], d['name']))    # country city
    else:
        locations.append(('',''))


final[['Country','City 2']] = pd.DataFrame(locations)
final

Unnamed: 0,Instagram Handle,Business Type,Pro Type,Business Name,Pro Name,City 1,Services,Business (T/F),Zip / Postal,Street Address,Latitude,Longitude,Phone Number,Website,Email,Country,City 2
0,1.0,,,,,,balayage,,,,,,,,,,
1,hairbypetrina,,,Hair By Petrina,Hair By Petrina,,"balayage,highlights",,,,,,,,,,
2,hairbypeytonxx,spa,"hair stylist,barber",Lovely Locks By Peyton,Lovely Locks By Peyton,,"Highlights,BALAYAGE,highlights,hair color,bala...",,,,,,(810) 610-9453,,,,
3,hairbypinkyd,"Hair Salon,spa",,Pinky Denton,Pinky Denton,,"hair extensions,BALAYAGE,Highlights,highlights...",,,,,,904.285.1875,,,,
4,hairbypizer,"Hair Salon,Spa,spa",,Madison Pizer | DC Stylist,Madison Pizer | DC Stylist,,"Balayage,hair Color,highlights,hair color,bala...",,,,,,240.328.4396,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3741,zozan_guzellik_salonu,,,Zozangokgüzelliksalonu,Zozangokgüzelliksalonu,"Istanbul, Turkey",,True,,mevlana celalettin Rumi cad. başakşehir,41.09404,28.80843,,,,TR,Basaksehir
3742,zullyalburjas,,,,,,,False,,,,,,,,,
3743,zynpklc3434,,,zeynep Kiliç,zeynep Kiliç,,,False,,,,,,,,,
3744,zytka79,,,lucyna wojtanowicz,lucyna wojtanowicz,,,False,,,,,,,,,


In [None]:
final["City"] = final["City 1"].fillna(final["City 2"])

# drop redundant
final.drop(['City 1', 'City 2', 'Latitude', 'Longitude', 'Street Address'], axis=1, inplace=True)
final

Unnamed: 0,Instagram Handle,Business Type,Pro Type,Business Name,Pro Name,Services,Business (T/F),Zip / Postal,Phone Number,Website,Email,Country,City
0,1.0,,,,,balayage,,,,,,,
1,hairbypetrina,,,Hair By Petrina,Hair By Petrina,"balayage,highlights",,,,,,,
2,hairbypeytonxx,spa,"hair stylist,barber",Lovely Locks By Peyton,Lovely Locks By Peyton,"Highlights,BALAYAGE,highlights,hair color,bala...",,,(810) 610-9453,,,,
3,hairbypinkyd,"Hair Salon,spa",,Pinky Denton,Pinky Denton,"hair extensions,BALAYAGE,Highlights,highlights...",,,904.285.1875,,,,
4,hairbypizer,"Hair Salon,Spa,spa",,Madison Pizer | DC Stylist,Madison Pizer | DC Stylist,"Balayage,hair Color,highlights,hair color,bala...",,,240.328.4396,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3741,zozan_guzellik_salonu,,,Zozangokgüzelliksalonu,Zozangokgüzelliksalonu,,True,,,,,TR,"Istanbul, Turkey"
3742,zullyalburjas,,,,,,False,,,,,,
3743,zynpklc3434,,,zeynep Kiliç,zeynep Kiliç,,False,,,,,,
3744,zytka79,,,lucyna wojtanowicz,lucyna wojtanowicz,,False,,,,,,


In [None]:
# clean up duplicates
final['Services'] = final['Services'].str.lower().apply(lambda x: set(x.split(",")))
final['Services'] = [','.join(map(str, l)) for l in final['Services']]

final['Business Type'] = final['Business Type'].str.lower().apply(lambda x: set(x.split(",")))
final['Business Type'] = [','.join(map(str, l)) for l in final['Business Type']]

final['Pro Type'] = final['Pro Type'].str.lower().apply(lambda x: set(x.split(",")))
final['Pro Type'] = [','.join(map(str, l)) for l in final['Pro Type']]

final

Unnamed: 0,Instagram Handle,Business Type,Pro Type,Business Name,Pro Name,Services,Business (T/F),Zip / Postal,Phone Number,Website,Email,Country,City
0,1.0,,,,,balayage,,,,,,,
1,hairbypetrina,,,Hair By Petrina,Hair By Petrina,"balayage,highlights",,,,,,,
2,hairbypeytonxx,spa,"hair stylist,barber",Lovely Locks By Peyton,Lovely Locks By Peyton,"balayage,highlights,hair color",,,(810) 610-9453,,,,
3,hairbypinkyd,"spa,hair salon",,Pinky Denton,Pinky Denton,"hair extensions,highlights,men's haircut,hair ...",,,904.285.1875,,,,
4,hairbypizer,"spa,hair salon",,Madison Pizer | DC Stylist,Madison Pizer | DC Stylist,"balayage,highlights,hair color",,,240.328.4396,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3741,zozan_guzellik_salonu,,,Zozangokgüzelliksalonu,Zozangokgüzelliksalonu,,True,,,,,TR,"Istanbul, Turkey"
3742,zullyalburjas,,,,,,False,,,,,,
3743,zynpklc3434,,,zeynep Kiliç,zeynep Kiliç,,False,,,,,,
3744,zytka79,,,lucyna wojtanowicz,lucyna wojtanowicz,,False,,,,,,


In [None]:
#final.apply(lambda row: '' if row['Business (T/F)'] == 'True' else row)
final['Business (T/F)'] = final.apply(lambda x: 'True' if x['Business Type'] != '' else x['Business (T/F)'], axis=1)
final['Business Name'] = final.apply(lambda x: '' if x['Business (T/F)'] == 'False' else x['Business Name'], axis=1)
final['Pro Name'] = final.apply(lambda x: '' if x['Business (T/F)'] == 'True' else x['Pro Name'], axis=1)
final

Unnamed: 0,Instagram Handle,Business Type,Pro Type,Business Name,Pro Name,Services,Business (T/F),Zip / Postal,Phone Number,Website,Email,Country,City
0,1.0,,,,,balayage,,,,,,,
1,hairbypetrina,,,Hair By Petrina,Hair By Petrina,"balayage,highlights",,,,,,,
2,hairbypeytonxx,spa,"hair stylist,barber",Lovely Locks By Peyton,,"balayage,highlights,hair color",True,,(810) 610-9453,,,,
3,hairbypinkyd,"spa,hair salon",,Pinky Denton,,"hair extensions,highlights,men's haircut,hair ...",True,,904.285.1875,,,,
4,hairbypizer,"spa,hair salon",,Madison Pizer | DC Stylist,,"balayage,highlights,hair color",True,,240.328.4396,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3741,zozan_guzellik_salonu,,,Zozangokgüzelliksalonu,,,True,,,,,TR,"Istanbul, Turkey"
3742,zullyalburjas,,,,,,False,,,,,,
3743,zynpklc3434,,,,zeynep Kiliç,,False,,,,,,
3744,zytka79,,,,lucyna wojtanowicz,,False,,,,,,


In [None]:
from google.colab import drive
drive.mount('drive')

Drive already mounted at drive; to attempt to forcibly remount, call drive.mount("drive", force_remount=True).


In [None]:
final.to_csv('FINAL.csv')
!cp FINAL.csv "drive/My Drive/"