# Imports

In [11]:
import pandas as pd
import matplotlib.pyplot as plt
from fa2 import ForceAtlas2
import numpy as np
import csv
import networkx as nx
import os
# The couchsurfing API wrapper is an external module saved locally; it is also available in our GitHub repository: https://github.com/eikekutz/SocialGraphFinalProject
from couchsurfing import Api
from random import randint
from time import sleep
from geopy import geocoders
import unidecode
import conda
#Necessary Settings for the conda package to work with the Basemap
conda_file_dir = conda.__file__
conda_dir = conda_file_dir.split('lib')[0]
proj_lib = os.path.join(os.path.join(conda_dir, 'share'), 'proj')
os.environ["PROJ_LIB"] = proj_lib

from mpl_toolkits.basemap import Basemap


 * **0\. Data Extraction**
    * 0.1\. Couchsurfing API Data Gathering
        * 0.1.1\. Get Hosts Data
        * 0.1.2\. Get Reviews and Reviewers Data
    * 0.2\. Data Merging
    * 0.3\. Adding Geo Location of Users
    * 0.4\. Cleaning Data
    
* 1\. Project's objective (**fix list problem**)
    * 1.1\. What do we want to answer in this project
    * 1.2\. Datasets limitations
    
* **2\. Preliminary Data Analysis**
    * 2.1\. User
        * 2.1.1\. User Analysis
        * 2.1.2\. User Statistics
        * 2.1.3\. User Graphs
    * 2.2\. City
        * 2.2.1\. City Clustering
        * 2.2.2\. City Analysis
        * 2.2.3\. City Statistics
        * 2.2.4\. City Graphs

* **3\. Network Construction**
    * 3.1\. User
        * 3.1.1\. Initial Analysis
        * 3.1.2\. Alternative Construction
    * 3.2\. City
        * 3.2.1\. Initial Analysis
        * 3.2.2\. Alternative Construction

* **4\. Basic Network Analysis**
    * 4.1\. Degree Distribution
    * 4.2\. Power-laws and Friendship Paradox
    * 4.3\. Centrality
    * 4.4\. Assortativity
    * 4.5\. Modularity and Communities
    * 4.6\. Network Visualizations and Statistics

* **5\. Analysis of Review data**
    * 5.1\. Wordclouds
    * 5.2\. Happiness Averages
        * 5.2.1\. Users
        * 5.2.2\. Cities

* **6\. Analysis of specific Users**
    * 6.1\. Combined sentiment
        * 6.1.1\.  Image Sentiment
        * 6.1.2\. Profile description Sentiment
    * 6.2\. Best/Worst User
        * 6.2.1\. Best/Worst Couchsurfer
        * 6.2.2\. Best Host

* **7\. Discussion**

# 0. Data Extraction

In [2]:
def getHostFromCity(city,latitude,longitude,max_pages=20):
    '''
    Download the hosts around a coordinate and store them one csv file per page.

    Requests pages 1..max_pages (inclusive) from api.get_hosts_latlong, hands every
    page to writeCityResults2File and sleeps a random 2-5 seconds after each page.
    Per the original author's note the call asks for 100 users per page within a
    radius of 25km; the meaning of the trailing argument 4 is not visible here.

    Parameters:
        city      -- city name, used for the log line and the output file name
        latitude  -- latitude of the search centre, passed to the API
        longitude -- longitude of the search centre, passed to the API
        max_pages -- number of result pages to request (default 20, the maximum
                     stated by the original docstring)
    '''
    print("Getting results for ",city,latitude,longitude)
    # range() excludes its stop value, hence max_pages+1 to really fetch page max_pages
    for p in range(1,max_pages+1):
        print(p)
        hosts = api.get_hosts_latlong(latitude,longitude,p,100,25,4)
        writeCityResults2File(hosts,city,p)
        # random pause between requests
        sleep(randint(2,5))

In [3]:
def writeCityResults2File(results,cityString,page):
    '''
    Write one page of host search results to data/Top50/hosts/<city><page>.csv.

    The file is (re)created with a header row followed by one row per entry of
    results['results']. It is written as UTF-8 so that free-text fields such as
    aboutText survive on every platform and match the default encoding used by
    pandas.read_csv when the files are loaded again.

    Parameters:
        results    -- mapping with a 'results' list of host dicts
        cityString -- city name; part of the file name and value of the 'city' column
        page       -- page number; part of the file name
    '''
    # Keys copied from every host dict, in column order. 'city' is appended
    # separately because it comes from cityString, not from the host dict.
    host_fields = ("id",
                   "publicName",
                   "avatarUrl",
                   "isVerified",
                   "status",
                   "lastLogin",
                   "aboutText",
                   "responseRate",
                   "responseTimeText",
                   "responseRateText",
                   "totalReferencesCount",
                   "profileLink",
                   "friendsCount",
                   "languages")
    path = 'data/Top50/hosts/'+cityString+str(page)+'.csv'
    with open(path,'w',newline="",encoding="utf-8") as f:
        writer = csv.writer(f,delimiter = ',')
        writer.writerow(list(host_fields)+["city"])
        for host in results['results']:
            writer.writerow([host[field] for field in host_fields]+[cityString])

In [5]:
#import nodes to get id from nodes/hosts
def importHosts(index,max_pages=20):
    '''
    Load all downloaded host pages of one city into a single dataframe.

    The city is looked up by index label in the global `cities` table; its page
    files are expected at data/Top50/hosts/<name><page>.csv with pages numbered
    from 1. Missing page files are skipped.

    Parameters:
        index     -- index label of the city in `cities`
        max_pages -- highest page number to look for (default 20, so that every
                     page the downloader may have written is picked up)

    Returns a dataframe with a fresh 0..n-1 index; it is empty when no page
    file exists for the city.
    '''
    city = cities.loc[index]
    frames = []
    for p in range(1,max_pages+1):
        path = 'data/Top50/hosts/'+city['name']+str(p)+'.csv'
        if os.path.exists(path):
            frames.append(pd.read_csv(path))
    if not frames:
        # pd.concat raises on an empty list
        return pd.DataFrame()
    # concatenate once instead of growing a dataframe inside the loop
    return pd.concat(frames,ignore_index=True)

In [6]:
def createRevCityList(city): 
    '''
    Start a fresh reviews csv file for the given city that contains only the header row
    '''
    columns = ["id", "text", "createdDate", "experience", "relationshipType",
               "isPostTrip", "to", "from", "fromPublicName", "fromPublicAddressId",
               "fromPublicAddressDescription", "fromAvatarUrl", "fromStatus",
               "fromIsVerified", "fromIsDeleted", "fromBlockedBy",
               "inverseReference", "response"]
    target = 'data/Top50/reviews/'+city+'Rev.csv'
    # mode 'w' truncates a file left over from an earlier run
    with open(target,'w',newline="") as handle:
        csv.writer(handle,delimiter = ',').writerow(columns)

In [7]:
def appendReview(file,city):
    '''
    Append the reviews of one API response to the reviews csv file of a city.

    One row is written per entry of file['results']; nested values (the
    reviewed user, the author and the author's public address) are flattened
    into the column order of the header written by createRevCityList. The file
    is opened as UTF-8 so that review texts and names with non-ASCII characters
    are stored identically on every platform.

    Parameters:
        file -- mapping with a 'results' list of review dicts
        city -- city name; selects data/Top50/reviews/<city>Rev.csv
    '''
    with open('data/Top50/reviews/'+city+'Rev.csv','a',newline="",encoding="utf-8") as f:
        writer = csv.writer(f,delimiter = ',')
        for review in file['results']:
            author = review["from"]
            address = author["publicAddress"]
            writer.writerow([review["id"],
                review["text"],
                review["createdDate"],
                review["experience"],
                review["relationshipType"],
                review["isPostTrip"],
                review["to"]['id'],
                author['id'],
                author["publicName"],
                address["id"],
                address["description"],
                author["avatarUrl"],
                author["status"],
                author["isVerified"],
                author["isDeleted"],
                author["blockedBy"],
                review["inverseReference"],
                review["response"]])

In [9]:
def getReferenceForCity(index):
    '''
    Download the references of every stored host of one city.

    Loads the hosts of the city via importHosts, starts a fresh reviews file for
    the city and then, host by host, appends first the 'host' and then the
    'surf' references returned by the API.
    '''
    hosts = importHosts(index=index)
    cityName = cities.loc[index]['name']
    print("Getting results for ",cityName)
    createRevCityList(cityName)
    for _,host in hosts.iterrows():
        for refType in ('host','surf'):
            references = api.get_references(uid=host['id'],type=refType)
            appendReview(references,cityName)

## 0.1. Couchsurfing API Data Gathering

In [None]:
# Download the host pages of every city in the city list, pausing a random
# 10-20 seconds between two cities.
# (filter used in an earlier run: cities.loc[cities['2017']>50000])
for _,cityRow in cities.iterrows():
    getHostFromCity(cityRow['name'],cityRow['lat'],cityRow['lng'])
    sleep(randint(10,20))

In [None]:
# Download the references of the hosts city by city, pausing a random 5-10
# seconds after each processed city.
# NOTE(review): cities whose index is not greater than 10 are skipped -
# presumably they were already downloaded in an earlier run; confirm before
# re-running this cell from scratch.
for idx in cities.index:
    if not idx>10:
        continue
    getReferenceForCity(idx)
    sleep(randint(5,10))

## 0.2. Data Merging



In [None]:
#Merging host's data
# Page files are numbered from 1. Look for up to MAX_HOST_PAGES files per city so
# that every page the downloader may have written is merged; page numbers that
# do not exist on disk are skipped.
MAX_HOST_PAGES = 20
hostFrames = []
for idx,city in cities.iterrows():
    for p in range(1,MAX_HOST_PAGES+1):
        hostFile = 'data/Top50/hosts/'+city['name']+str(p)+'.csv'
        if os.path.exists(hostFile):
            hostFrames.append(pd.read_csv(hostFile,index_col=False,dtype={'id': np.int64}))
# Concatenate once; pd.concat raises on an empty list, so fall back to an empty frame.
hosts = pd.concat(hostFrames,ignore_index=True) if hostFrames else pd.DataFrame()
# Drop rows that are exact duplicates across all columns.
hosts=hosts.drop_duplicates()

In [None]:
#merging review data
# Collect the per-city review files first and concatenate them once, instead of
# growing a dataframe that starts out empty inside the loop.
revFrames = []
for idx,city in cities.iterrows():
    revFile = 'data/Top50/reviews/'+city['name']+'Rev.csv'
    if os.path.exists(revFile):
        revFrames.append(pd.read_csv(revFile,index_col=False))
# pd.concat raises on an empty list, so fall back to an empty frame.
rev = pd.concat(revFrames,ignore_index=True) if revFrames else pd.DataFrame()
# Drop rows that are exact duplicates across all columns.
rev=rev.drop_duplicates()

## 0.3. Adding Geo Location of Users



In [None]:
# Names for which no geocoding result could be obtained.
cities_not_found = []
def getCityCoords(name,username='kacper'):
    '''
    Geocode a location name with the GeoNames geocoder.

    The full name is tried first; when that yields no result, the part before
    the first comma is tried. Every name that cannot be resolved - because both
    lookups returned nothing or because a lookup raised - is reported on stdout
    and recorded in cities_not_found.

    Parameters:
        name     -- location string to geocode
        username -- GeoNames account name used for the request

    Returns the geocoder result, or None when the name could not be resolved.
    '''
    gn = geocoders.GeoNames(username=username)
    try:
        loc = gn.geocode(name, timeout=10)
        if loc is None:
            # retry with only the part before the first comma
            loc = gn.geocode(name.split(',')[0],timeout=10)
    except Exception:
        # best effort: any failure of the lookup counts as "not found"
        loc = None
    if loc is None:
        print('This city was not found:',name)
        cities_not_found.append(name)
    return loc

In [None]:
# World cities table, read as ISO-8859-1
# (source: https://www.maxmind.com/en/free-world-cities-database)
worldCitiesFile = '../worldcitiespop.txt'
worldCities = pd.read_csv(worldCitiesFile,encoding = "ISO-8859-1")
# Country table used to map country names to country codes
# (source: https://datahub.io/core/country-list#resource-data)
countryMapFile = 'data/country_map.txt'
cc = pd.read_csv(countryMapFile)

In [15]:
# Peek at the world cities table: number of rows plus the first three rows
worldCitiesCount = len(worldCities)
print('Number of entries worldCitiespop.txt: ',worldCitiesCount)
worldCities.head(n=3)

Number of entries worldCitiespop.txt:  3173958


Unnamed: 0,Country,City,AccentCity,Region,Population,Latitude,Longitude
0,ad,aixas,Aixàs,6,,42.483333,1.466667
1,ad,aixirivali,Aixirivali,6,,42.466667,1.5
2,ad,aixirivall,Aixirivall,6,,42.466667,1.5


In [None]:

# Distinct reviewer location strings as a plain Python list
uniqueCities = rev['fromPublicAddressDescription'].unique().tolist()

In this part we map a geographical position to every review.
With this information we know the geographical position of each reviewer.
Due to the high number of unique reviewer locations and the geopy API's limit of 5000 requests per day, we first look up each location in worldcitiespop.txt.
If an entry cannot be found in this list, we use the geopy API to supplement the missing entries.
If a review's location still cannot be found, the review is dropped afterwards.
The number of locations that were not found is 815 out of 96085, which is 0.848% of the review data.

In [None]:
# Location strings (lower-cased) that are never looked up.
exList=["nan"]


def _geo_is_missing(values):
    '''True when every entry is NaN/None or the placeholder string 'NaN' (also true for no entries).'''
    return bool((values.isna() | (values == 'NaN')).all())


def _match_world_city(description):
    '''
    Return at most one row of worldCities for a 'City, ..., Country' string.

    The text before the first comma is matched against worldCities.City. When
    several rows match, the text after the last comma is translated to a
    country code via cc and used to narrow the candidates; if the country is
    unknown the first candidate is used.
    '''
    cityKey = description.split(',')[0].lower().strip()
    candidates = worldCities.loc[worldCities.City==cityKey]
    if len(candidates.index)>1:
        countryName = description.split(',')[-1].strip().lower().title()
        try:
            countryCode = cc.loc[cc.Name==countryName]['Code'].values[0]
            candidates = candidates.loc[candidates.Country==countryCode.lower()]
        except (IndexError, AttributeError):
            # no such country name in cc, or its code is not a string
            pass
    return candidates.head(1)


def _lookup_world_city(description):
    '''Match the description as written and, if that fails, its ASCII transliteration.'''
    match = _match_world_city(description)
    if len(match.index)==0:
        match = _match_world_city(unidecode.unidecode(description))
    return match


# Create the target columns when they are not present yet.
# NOTE(review): the original check compared against the string 'NaN', so the
# columns are presumably initialised elsewhere - confirm.
for column,columnType in (('lat',float),('lng',float),('city',object),('country',object)):
    if column not in rev.columns:
        rev[column] = pd.Series(index=rev.index,dtype=columnType)

for city in uniqueCities:
    if str(city).lower() in exList:
        continue
    rows = rev.fromPublicAddressDescription==city
    # only fill locations that have no position yet
    if not _geo_is_missing(rev.loc[rows,'lat']):
        continue
    #Search for entry in the worldcities list and map with country code (cc) if necessary
    match = _lookup_world_city(city)
    if len(match.index)==1:
        geo = (match.Latitude.values[0],
               match.Longitude.values[0],
               match.City.values[0],
               match.Country.values[0])
    else:
        #is the entry not found in the worldcities use the API to download the geoposition
        print('try api for :',city)
        res = getCityCoords(city)
        if res is None:
            print('Could not found ',city)
            continue
        addressWords = res.address.split(' ')
        geo = (res.latitude,
               res.longitude,
               addressWords[0].lower(),
               addressWords[-1].lower())
    # assign column by column so that numbers and strings keep their own types
    for column,value in zip(('lat','lng','city','country'),geo):
        rev.loc[rows,column] = value
rev.to_csv('data/reviews_total_geo_x.csv', index=False, header=True)

## 0.4. Cleaning Data

