In [5]:
# -*- coding: utf-8 -*-
from lxml import html
import requests
import pandas as pd
import numpy as np
import time

#extract times and comments for any given match
def extract_commentary_ESPN(id):
    id=str(id)
    page = requests.get('http://www.espnfc.com/commentary/'+id+'/commentary.html')
    tree = html.fromstring(page.text)
    times = tree.xpath('//div[@class="timestamp"]/p/text()')
    comments = tree.xpath('//div[@class="comment"]/p/text()')
    df = pd.DataFrame({'times':times, 'comments':comments})
    return df

#let's test with a random match
my_match=extract_commentary_ESPN(402356)
my_match.head(10)

Unnamed: 0,comments,times
0,"Second Half ends, Barcelona 2, Real Madrid 1.",90'
1,"Corner, Real Madrid. Conceded by Dani Alves.",89'
2,"Substitution, Real Madrid. Lucas Silva replace...",88'
3,"Corner, Real Madrid. Conceded by Gerard Piqué.",88'
4,Attempt saved. Lionel Messi (Barcelona) left f...,87'
5,Attempt missed. Luis Suárez (Barcelona) right ...,86'
6,"Corner, Barcelona. Conceded by Iker Casillas.",86'
7,Attempt saved. Jordi Alba (Barcelona) left foo...,86'
8,Rafinha (Barcelona) wins a free kick in the de...,85'
9,Foul by Daniel Carvajal (Real Madrid).,85'


In [6]:
#remove the last character of "times" column
def delete_last(a):
    return (a[:-1])

my_match.times=my_match.times.apply(delete_last)
my_match.head(10)

Unnamed: 0,comments,times
0,"Second Half ends, Barcelona 2, Real Madrid 1.",90
1,"Corner, Real Madrid. Conceded by Dani Alves.",89
2,"Substitution, Real Madrid. Lucas Silva replace...",88
3,"Corner, Real Madrid. Conceded by Gerard Piqué.",88
4,Attempt saved. Lionel Messi (Barcelona) left f...,87
5,Attempt missed. Luis Suárez (Barcelona) right ...,86
6,"Corner, Barcelona. Conceded by Iker Casillas.",86
7,Attempt saved. Jordi Alba (Barcelona) left foo...,86
8,Rafinha (Barcelona) wins a free kick in the de...,85
9,Foul by Daniel Carvajal (Real Madrid).,85


In [7]:
#Finding corners occurred in the match:

#convert times to number, since they are a string
my_match.times=my_match.times.convert_objects(convert_numeric=True)
#filtering rows where comment string starts with 'Corner'
my_match[my_match.comments.str.startswith('Corner')]

Unnamed: 0,comments,times
1,"Corner, Real Madrid. Conceded by Dani Alves.",89
3,"Corner, Real Madrid. Conceded by Gerard Piqué.",88
6,"Corner, Barcelona. Conceded by Iker Casillas.",86
14,"Corner, Real Madrid. Conceded by Claudio Bravo.",78
48,"Corner, Real Madrid. Conceded by Gerard Piqué.",58
51,"Corner, Barcelona. Conceded by Sergio Ramos.",53
52,"Corner, Barcelona. Conceded by Sergio Ramos.",53
69,"Corner, Real Madrid. Conceded by Claudio Bravo.",43
73,"Corner, Real Madrid. Conceded by Gerard Piqué.",39
87,"Corner, Barcelona. Conceded by Marcelo.",31


In [8]:
#calculate minute of first corner
#filtering rows where comment string starts with 'Corner' and returning minimum time
np.amin(my_match[my_match.comments.str.startswith('Corner')]).times

3.0

In [10]:
#putting calculation in a function for further use
def calculate_minc1(df):
    df.times=df.times.apply(delete_last)
    #convert times to number, since they are a string
    df.times=df.times.convert_objects(convert_numeric=True)
    #filtering rows where comment string starts with 'Corner' and returning minimum time
    return np.amin(df[df.comments.str.startswith('Corner')]).times

In [11]:
#Extract Match Statistics: teams, total corners
def extract_statistics_ESPN(id):
    id=str(id)
    page = requests.get('http://www.espnfc.com/gamecast/statistics/id/'+id+'/statistics.html')
    tree = html.fromstring(page.text)
    team_away = tree.xpath('//div[@class="team away"]/p/a/text()')#visitante
    team_home=tree.xpath('//div[@class="team home"]/p/a/text()')#local
    script=tree.xpath('//*[@id="matchcenter-'+id+'"]/div[1]/p[1]/span/script/text()')
    try:
        timestamp = float(script[0][52:65])
        fecha=time.strftime("%a %d %b %Y %H:%M:%S GMT", time.gmtime(timestamp / 1000.0))
    except:
        fecha="UNKNOWN"
    competicion = tree.xpath('//div[@class="match-details"]/p[@class="floatleft"]/text()')
    try:
        competicion= competicion[0].strip()
    except:
        competicion="ERR COMPETICION"
    home_corners = tree.xpath('//td[@id="home-corner-kicks"]/text()')
    away_corners = tree.xpath('//td[@id="away-corner-kicks"]/text()')
    return pd.Series([id,team_away,team_home,competicion,home_corners,away_corners,fecha],index=['id','team_home','team_away','competition','corners_home','corners_away','date'])

extract_statistics_ESPN(402356)

id                                    402356
team_home                        [Barcelona]
team_away                      [Real Madrid]
competition         SPANISH PRIMERA DIVISIÓN
corners_home                             [7]
corners_away                            [10]
date            Sun 22 Mar 2015 20:00:00 GMT
dtype: object

In [12]:
#Put together the extracted info

def process_match(id):
    my_match=extract_commentary_ESPN(id)
    if len(my_match)>1:
        processed_match=extract_statistics_ESPN(id)
        minc1=pd.Series([calculate_minc1(my_match)],index=['minc1'])
        processed_match=processed_match.append(minc1)
        bad_match=False
    else:
        bad_match=True
    return processed_match,bad_match
process_match(402356)

(id                                    402356
 team_home                        [Barcelona]
 team_away                      [Real Madrid]
 competition         SPANISH PRIMERA DIVISIÓN
 corners_home                             [7]
 corners_away                            [10]
 date            Sun 22 Mar 2015 20:00:00 GMT
 minc1                                      3
 dtype: object, False)

In [13]:
#We need to extract chunks of matches and put them in a dataframe

def analyze_chunk(ids):
    partidos_df=pd.DataFrame()
    for id in ids:
        match_stats,partido_incorrecto=process_match(id)
        if partido_incorrecto==False:
            partidos_df=partidos_df.append(match_stats,ignore_index=True)
    return partidos_df

analyze_chunk(range(402355,402360))

Unnamed: 0,competition,corners_away,corners_home,date,id,minc1,team_away,team_home
0,SPANISH PRIMERA DIVISIÓN,[4],[7],Sat 21 Mar 2015 17:00:00 GMT,402355,2,[Malaga],[Rayo Vallecano]
1,SPANISH PRIMERA DIVISIÓN,[10],[7],Sun 22 Mar 2015 20:00:00 GMT,402356,3,[Real Madrid],[Barcelona]
2,SPANISH PRIMERA DIVISIÓN,[2],[8],Sun 22 Mar 2015 16:00:00 GMT,402357,15,[Sevilla FC],[Villarreal]
3,SPANISH PRIMERA DIVISIÓN,[3],[4],Fri 20 Mar 2015 19:45:00 GMT,402358,7,[Valencia],[Elche]
4,SPANISH PRIMERA DIVISIÓN,[5],[8],Sat 14 Mar 2015 21:00:00 GMT,402359,8,[Athletic Bilbao],[Celta Vigo]


In [18]:
#We will save time by not extracting info from already-scraped matches
def select_ids(id_arange,file):
    destination_df=pd.read_csv(file) #read the file
    criterion=id_arange.isin(destination_df.id)
    return id_arange[-criterion].values

corner_file="corners_append.csv"
scraped_df=pd.read_csv(corner_file) #read the file
print "EXISTING:"
print scraped_df.id
#check what id_matches from the range have already been scraped
raw_range=pd.Series(np.arange(402357,402365))
analysis_range=select_ids(raw_range,corner_file)
print "MISSING AND TO BE ANALYZED:"
print analysis_range

EXISTING:
0     402400
1     402401
2     402402
3     402403
4     402404
5     402405
6     402406
7     402407
8     402408
9     402409
10    402410
11    402411
12    402412
13    402413
14    402414
15    402415
16    402416
17    402417
18    402418
19    402419
20    402420
21    402421
22    402422
23    402423
24    402424
25    402425
26    402426
27    402427
28    402428
29    402429
30    402430
Name: id, dtype: int64
MISSING AND TO BE ANALYZED:
[402357 402358 402359 402360 402361 402362 402363 402364]


In [8]:
#study and save a range
def study_range (id_ini,id_fin,corner_file):
    #check what id_matches from the range have already been scraped
    raw_range=pd.Series(np.arange(id_ini,id_fin+1))
    analysis_range=select_ids(raw_range,corner_file)
    #perform scraping of missing id_matches
    chunk_df=analyze_chunk(analysis_range)
    #append results to file if there are any
    try:
        select_matches_append(chunk_df,corner_file)
    except:
        print "NOTHING ADDED"
study_range(402360,402400,'corners_append.csv')

NOTHING ADDED


In [12]:
#study in chunks of 50s and calculate time
import timeit
def study_in_chunks(ini,last,step,file):
    for i in range(ini,last+1,step):
        start = timeit.default_timer()
        study_range(i,i+step,file)
        print i
        stop = timeit.default_timer()
        print (stop - start)/60
        print "MINUTOS"
    
study_in_chunks(402400,402600,50,'corners_append.csv')

NOTHING ADDED
402400
0.000299648443858
MINUTOS
NOTHING ADDED
402450
0.000309149424235
MINUTOS
NOTHING ADDED
402500
0.000362551212311
MINUTOS
NOTHING ADDED
402550
0.000305199623108
MINUTOS
402600
0.0565403461456
MINUTOS


In [1]:
# -*- coding: utf-8 -*-
from lxml import html
import requests
import pandas as pd
import numpy as np
import time
import timeit

def extract_commentary_ESPN(id):
    id=str(id)
    page = requests.get('http://www.espnfc.com/commentary/'+id+'/commentary.html')
    tree = html.fromstring(page.text)
    times = tree.xpath('//div[@class="timestamp"]/p/text()')
    comments = tree.xpath('//div[@class="comment"]/p/text()')
    df = pd.DataFrame({'times':times, 'comments':comments})
    return df

#remove the last two characters of "times" column
def delete_last(a):
    return (a[:-1])

#calculate minute of first corner
def calculate_minc1(df):
    #remove the last two characters of "times" column
    df.times=df.times.apply(delete_last)
    #convert times to number, since they are a string
    df.times=df.times.convert_objects(convert_numeric=True)
    #filtering rows where comment string starts with 'Corner' and returning minimum time
    return np.amin(df[df.comments.str.startswith('Corner')]).times

#Extract Match Statistics: teams, total corners
def extract_statistics_ESPN(id):
    id=str(id)
    page = requests.get('http://www.espnfc.com/gamecast/statistics/id/'+id+'/statistics.html')
    tree = html.fromstring(page.text)
    team_away = tree.xpath('//div[@class="team away"]/p/a/text()')#visitante
    team_home=tree.xpath('//div[@class="team home"]/p/a/text()')#local
    script=tree.xpath('//*[@id="matchcenter-'+id+'"]/div[1]/p[1]/span/script/text()')
    try:
        timestamp = float(script[0][52:65])
        fecha=time.strftime("%a %d %b %Y %H:%M:%S GMT", time.gmtime(timestamp / 1000.0))
    except:
        fecha="UNKNOWN"
    competicion = tree.xpath('//div[@class="match-details"]/p[@class="floatleft"]/text()')
    try:
        competicion= competicion[0].strip()
    except:
        competicion="ERR COMPETICION"
    home_corners = tree.xpath('//td[@id="home-corner-kicks"]/text()')
    away_corners = tree.xpath('//td[@id="away-corner-kicks"]/text()')
    return pd.Series([id,team_away,team_home,competicion,home_corners,away_corners,fecha],index=['id','team_home','team_away','competition','corners_home','corners_away','date'])

#Put together the extracted info
def process_match(id):
    global processed_match
    global bad_match
    my_match=extract_commentary_ESPN(id)
    if len(my_match)>1:
        processed_match=extract_statistics_ESPN(id)
        minc1=pd.Series([calculate_minc1(my_match)],index=['minc1'])
        processed_match=processed_match.append(minc1)
        bad_match=False
    else:
        partido_incorrecto=True
    return processed_match,bad_match

#We need to extract chunks of matches and present them in a dataframe
def analyze_chunk(ids):
    partidos_df=pd.DataFrame()
    for id in ids:
        match_stats,partido_incorrecto=process_match(id)
        if partido_incorrecto==False:
            partidos_df=partidos_df.append(match_stats,ignore_index=True)
    return partidos_df

#Append results to a csv file
def append_to_csv(file,df):
    f = open(file, 'a') # Añadir los resultados al archivo de corners
    df.to_csv(f,header=False,encoding="utf-8")
    f.close()

#append only new matches
def select_matches_append(obtained_df,file):
    destination_df=pd.read_csv(file) #read the file
    obtained_df.id=obtained_df.id.convert_objects(convert_numeric=True)
    criterion=obtained_df.id.isin(destination_df.id)
    append_to_csv(file,obtained_df[-criterion])#inverse, because we want to include what is NOT already in the file

#We will save time by not extracting info from already-scraped matches
def select_ids(id_arange,file):
    destination_df=pd.read_csv(file) #read the file
    criterion=id_arange.isin(destination_df.id)
    return id_arange[-criterion].values

#study and save a range
def study_range (id_ini,id_fin,corner_file):
    #check what id_matches from the range have already been scraped
    raw_range=pd.Series(np.arange(id_ini,id_fin+1))
    analysis_range=select_ids(raw_range,corner_file)
    #perform scraping of missing id_matches
    chunk_df=analyze_chunk(analysis_range)
    #append results to file if there are any
    try:
        select_matches_append(chunk_df,corner_file)
    except:
        print "NOTHING ADDED"

#study in chunks of 100s and calculate time    
def study_in_chunks(ini,last,step,file)
    for i in range(ini,last,step):
        start = timeit.default_timer()
        study_range(i,i+step,file)
        print i
        stop = timeit.default_timer()
        print (stop - start)/60
        print "MINUTOS"
    
study_in_chunks(402400,402600,50,'corners_append.csv')

NOTHING ADDED


Unnamed: 0.1,Unnamed: 0,competition,corners_away,corners_home,date,id,minc1,team_away,team_home
24,14,SPANISH PRIMERA DIVISIÓN,[2],[8],Sun 01 Feb 2015 18:00:00 GMT,402424,3,[Espanyol],[Sevilla FC]
25,15,SPANISH PRIMERA DIVISIÓN,[3],[6],Sun 01 Feb 2015 16:00:00 GMT,402425,9,[Getafe],[Almeria]
26,16,SPANISH PRIMERA DIVISIÓN,[3],[10],Sat 31 Jan 2015 15:00:00 GMT,402426,1,[Real Sociedad],[Real Madrid]
27,17,SPANISH PRIMERA DIVISIÓN,[3],[5],Mon 02 Feb 2015 19:45:00 GMT,402427,13,[Valencia],[Malaga]
28,18,SPANISH PRIMERA DIVISIÓN,[2],[8],Sun 01 Feb 2015 20:00:00 GMT,402428,1,[Villarreal],[Barcelona]
29,19,SPANISH PRIMERA DIVISIÓN,[7],[9],Sun 25 Jan 2015 18:00:00 GMT,402429,17,[Almeria],[Espanyol]
30,20,SPANISH PRIMERA DIVISIÓN,[3],[2],Sat 24 Jan 2015 17:00:00 GMT,402430,26,[Barcelona],[Elche]
31,0,SPANISH PRIMERA DIVISIÓN,[2],[6],Mon 26 Jan 2015 19:45:00 GMT,402431,7,[Celta Vigo],[Getafe]
32,1,SPANISH PRIMERA DIVISIÓN,[5],[9],Sat 24 Jan 2015 21:00:00 GMT,402432,3,[Eibar],[Real Sociedad]
33,2,SPANISH PRIMERA DIVISIÓN,[2],[7],Sun 25 Jan 2015 11:00:00 GMT,402433,11,[Granada],[Deportivo La Coruña]


In [2]:
#We want to check for statistical significance

corner_file="corners_append.csv"
analysis_df=pd.read_csv(corner_file)
total_matches=len(analysis_df)-1
corner_true=len(analysis_df[analysis_df.minc1<10])
print total_matches,corner_true

33 23


In [6]:
import scipy
from scipy import stats
import numpy as np

expected_odds=1/1.83 #expected 54%
expected_true=total_matches*expected_odds

observed = np.array([corner_true,total_matches-corner_true])
expected = np.array([expected_true,total_matches-expected_true])

scipy.stats.chisquare(observed, f_exp=expected)

(3.0167250821467695, 0.08240972920843942)

In [56]:
def chisquare_on_csv(expected_freq,file):
    analysis_df=pd.read_csv(file)
    #we drop every match where minc1=NaN
    analysis_df=analysis_df.dropna(subset = ['minc1'])
    total_matches=len(analysis_df)-1
    corner_true=len(analysis_df[analysis_df.minc1<10])
    expected_true=total_matches*expected_freq
    observed = np.array([corner_true,total_matches-corner_true])
    expected = np.array([expected_true,total_matches-expected_true])
    chsq,pval= scipy.stats.chisquare(observed,f_exp=expected)
    proportion_true=float((corner_true*100)/total_matches)
    return str(chsq),str(pval),str(total_matches),str(corner_true),str(proportion_true)
    
    
expected_freq=1/1.83 #expected 54%
chsq,pval,total_matches,corner_true,proportion_true=chisquare_on_csv(expected_freq,'corners_append.csv')
print "Expected: "+str(expected_freq*100)+"%, Observed: "+proportion_true+"% on "+total_matches+" matches"
print "Your p-value is "+pval
print "The p value is the probability that the deviation of the observed from that expected"
print "is due to chance alone (no other forces acting)"

Expected: 54.6448087432%, Observed: 60.0% on 790 matches
Your p-value is 0.00120441672208
The p value is the probability that the deviation of the observed from that expected
is due to chance alone (no other forces acting)


In [28]:
#delete brackets
def remove_brackets(string):
    string2=string.replace("[", "")
    return string2.replace("]","")


analysis_df=pd.read_csv('corners_append.csv')
analysis_df.corners_away=analysis_df.corners_away.apply(remove_brackets)
analysis_df.corners_home=analysis_df.corners_home.apply(remove_brackets)
analysis_df.team_away=analysis_df.team_away.apply(remove_brackets)
analysis_df.team_home=analysis_df.team_home.apply(remove_brackets)


4687
704


('3916.18602903', '0.0', '4686', '428')

In [61]:
corner_file="corners_append.csv"
analysis_df=pd.read_csv(corner_file)

In [146]:
#pasar fecha a ordinal
import time
import datetime
import numpy as np
def convert_to_datetime(date):
    string2=(date[4:15])
    conv_time=time.strptime(string2,'%d %b %Y')
    d=datetime.date(conv_time.tm_year, conv_time.tm_mon, conv_time.tm_mday)
    return d
string=analysis_df.date[10]
d= convert_to_datetime(string)
print d

2015-02-08


In [144]:
print d.month

2


In [None]:
#starting machine learning
import numpy as np
perm = np.random.permutation(y.size)
print perm
PRC = 0.7
split_point = int(np.ceil(y.shape[0]*PRC))