In [1]:
import pandas as pd

In [2]:
!pip install pyspark
!pip install findspark
import findspark
findspark.init()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 44 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 55.1 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=25543efccefe48ceb543ba044e7c6e6cd84c3d64815c8cf606ab89016dda458d
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting findspark

In [3]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [35]:
# Creating a spark context class
sc = SparkContext()

# Creating a spark session
spark = SparkSession \
    .builder \
    .appName("Python Spark DataFrames basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

ValueError: ignored

In [36]:
class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {"Brighton and Hove Albion": "Brighton", 
              "Manchester United": "Manchester Utd", 
              "Newcastle United": "Newcastle Utd", 
              "Tottenham Hotspur": "Tottenham", 
              "West Ham United": "West Ham", 
              "Wolverhampton Wanderers": "Wolves"} 
mapping = MissingDict(**map_values)

In [37]:
def frompd(path):
  matches = pd.read_csv(path, index_col=0)
  del matches["notes"]
  matches["date"] = pd.to_datetime(matches["date"])
  matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")
  matches["day_code"] = matches["date"].dt.dayofweek
  matches["target"] = (matches["result"] == "W").astype("int")
  matches["venue_code"] =(matches["venue"] == "Away").astype("int")
  matches["new_team"] = matches["team"].map(mapping)

  return(matches)







In [38]:
matches=frompd("/content/drive/MyDrive/matches.csv")
matches2=frompd("/content/drive/MyDrive/matches2.csv")

In [39]:
df1 = spark.createDataFrame(matches) 
df2 = spark.createDataFrame(matches2)
df=df1.union(df2)

from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol='new_team', outputCol='new_team_numeric').fit(df)
indexed_df1 = indexer.transform(df)

In [40]:
indexer = StringIndexer(inputCol='opponent', outputCol='opponent_numeric').fit(indexed_df1 )
df = indexer.transform(indexed_df1)

In [41]:
df.select('team','new_team_numeric').distinct().sort("new_team_numeric").show(100)

+--------------------+----------------+
|                team|new_team_numeric|
+--------------------+----------------+
|    Newcastle United|             0.0|
|         Southampton|             1.0|
|     West Ham United|             2.0|
|Brighton and Hove...|             3.0|
|   Manchester United|             4.0|
|   Tottenham Hotspur|             5.0|
|Wolverhampton Wan...|             6.0|
|             Arsenal|             7.0|
|         Aston Villa|             8.0|
|      Crystal Palace|             9.0|
|             Everton|            10.0|
|        Leeds United|            11.0|
|      Leicester City|            12.0|
|     Manchester City|            13.0|
|             Chelsea|            14.0|
|             Burnley|            15.0|
|           Liverpool|            16.0|
|           Brentford|            17.0|
|        Norwich City|            18.0|
|             Watford|            19.0|
|              Fulham|            20.0|
|    Sheffield United|            21.0|


In [42]:
df.show()

+-------------------+-----+--------------+------------+---+-----+------+---+---+---------------+---+---+----+----------+---------------+---------+--------------+------------+----+----+----+---+---+-----+------+---------------+----+--------+------+----------+---------------+----------------+----------------+
|               date| time|          comp|       round|day|venue|result| gf| ga|       opponent| xg|xga|poss|attendance|        captain|formation|       referee|match report|  sh| sot|dist| fk| pk|pkatt|season|           team|hour|day_code|target|venue_code|       new_team|new_team_numeric|opponent_numeric|
+-------------------+-----+--------------+------------+---+-----+------+---+---+---------------+---+---+----+----------+---------------+---------+--------------+------------+----+----+----+---+---+-----+------+---------------+----+--------+------+----------+---------------+----------------+----------------+
|2022-08-05 00:00:00|20:00|Premier League| Matchweek 1|Fri| Away|     W|2

In [43]:
from pyspark.ml.feature import VectorAssembler

In [44]:
df.columns

['date',
 'time',
 'comp',
 'round',
 'day',
 'venue',
 'result',
 'gf',
 'ga',
 'opponent',
 'xg',
 'xga',
 'poss',
 'attendance',
 'captain',
 'formation',
 'referee',
 'match report',
 'sh',
 'sot',
 'dist',
 'fk',
 'pk',
 'pkatt',
 'season',
 'team',
 'hour',
 'day_code',
 'target',
 'venue_code',
 'new_team',
 'new_team_numeric',
 'opponent_numeric']

In [45]:
assembler=VectorAssembler(inputCols=['hour',
 'day_code',
 'venue_code',
 'new_team_numeric',
 'opponent_numeric'],outputCol='features')

In [46]:
output=assembler.transform(df)

In [47]:
output.select(['features','target']).show(truncate=False)

+------------------------+------+
|features                |target|
+------------------------+------+
|[20.0,4.0,1.0,7.0,10.0] |1     |
|[15.0,5.0,0.0,7.0,13.0] |1     |
|[17.0,5.0,1.0,7.0,23.0] |1     |
|[17.0,5.0,0.0,7.0,20.0] |1     |
|[19.0,2.0,0.0,7.0,6.0]  |1     |
|[16.0,6.0,1.0,7.0,8.0]  |0     |
|[12.0,6.0,1.0,7.0,17.0] |1     |
|[12.0,5.0,0.0,7.0,3.0]  |1     |
|[16.0,6.0,0.0,7.0,2.0]  |1     |
|[14.0,6.0,1.0,7.0,12.0] |1     |
|[14.0,6.0,1.0,7.0,1.0]  |0     |
|[14.0,6.0,0.0,7.0,24.0] |1     |
|[12.0,6.0,1.0,7.0,15.0] |1     |
|[19.0,5.0,1.0,7.0,5.0]  |1     |
|[16.0,6.0,1.0,13.0,4.0] |1     |
|[15.0,5.0,0.0,13.0,23.0]|1     |
|[16.0,6.0,1.0,13.0,0.0] |0     |
|[15.0,5.0,0.0,13.0,10.0]|1     |
|[19.0,2.0,0.0,13.0,24.0]|1     |
|[17.0,5.0,1.0,13.0,6.0] |0     |
+------------------------+------+
only showing top 20 rows



In [48]:
model_df=output.select(['features','target'])

In [49]:
training_df,test_df=model_df.randomSplit([0.7,0.3])

In [50]:
from pyspark.ml.classification import RandomForestClassifier

In [51]:
rf=RandomForestClassifier(labelCol="target",numTrees=50).fit(training_df)

In [52]:
rf_predictions=rf.transform(test_df)

In [53]:
test_df.show()

+--------------------+------+
|            features|target|
+--------------------+------+
|[12.0,5.0,0.0,4.0...|     0|
|[12.0,5.0,0.0,7.0...|     1|
|[12.0,5.0,0.0,7.0...|     1|
|[12.0,5.0,0.0,13....|     1|
|[12.0,5.0,0.0,13....|     1|
|[12.0,5.0,0.0,20....|     0|
|[12.0,5.0,0.0,24....|     1|
|[12.0,5.0,1.0,5.0...|     1|
|[12.0,5.0,1.0,6.0...|     0|
|[12.0,5.0,1.0,7.0...|     1|
|[12.0,5.0,1.0,9.0...|     0|
|[12.0,5.0,1.0,13....|     1|
|[12.0,5.0,1.0,13....|     1|
|[12.0,5.0,1.0,14....|     1|
|[12.0,6.0,0.0,17....|     0|
|[12.0,6.0,1.0,7.0...|     1|
|[12.0,6.0,1.0,7.0...|     1|
|[14.0,6.0,0.0,1.0...|     0|
|[14.0,6.0,0.0,3.0...|     1|
|[14.0,6.0,0.0,4.0...|     0|
+--------------------+------+
only showing top 20 rows



In [57]:
rf.save("\model")

Py4JJavaError: ignored

In [54]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [55]:
rf_ac=BinaryClassificationEvaluator(labelCol="target").evaluate(rf_predictions)

In [56]:
rf_ac

0.731172578828829

In [58]:
!pip install pyTelegramBotAPI
!pip install python_telegram_bot

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [59]:
import telebot
data=df.select('team','new_team_numeric').distinct().sort("new_team_numeric")

In [60]:
data=data.toPandas()

In [61]:
data2=df.select('opponent','opponent_numeric').distinct().sort("opponent_numeric")

In [62]:
data2=data2.toPandas()
data2

Unnamed: 0,opponent,opponent_numeric
0,Newcastle Utd,0.0
1,Southampton,1.0
2,Liverpool,2.0
3,Tottenham,3.0
4,West Ham,4.0
5,Wolves,5.0
6,Aston Villa,6.0
7,Brighton,7.0
8,Manchester Utd,8.0
9,Arsenal,9.0


In [63]:
bot = telebot.TeleBot('5883034778:AAGTKqrWb9uzLJGRbcZb0txTt_kjNtPjSmo')
predict=[]
@bot.message_handler(commands=['start'])
def get_message(message):
    bot.send_message(message.chat.id, f'enter 5 numbers \n first: hour of the game \n second: day of the week, where Monday is 1 Sunday is 7\n third:venue code, where away is 1, home is 0 \n fourth:number of the team  a team \n {data} \n fifth:number of opponent \n {data2}')

@bot.message_handler()
def get_message(message):
    predict=(message.text.split(" "))
    predict=tuple([float(x) for x in predict])
    pr=spark.createDataFrame([predict],['hour',
 'day_code',
 'venue_code',
 'new_team_numeric',
 'opponent_numeric'])
    output=assembler.transform(pr)
    rf_predictions=rf.transform(output)
    answer=rf_predictions.toPandas()
    bot.send_message(message.chat.id,f"{answer['prediction']} with probability {answer['probability'][0]}")


if __name__ == "__main__":
    bot.polling(none_stop=True, interval=0)

In [64]:
#spark.stop()