In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
import keras.utils as ku 
from keras.models import load_model

import pandas as pd
import numpy as np
import string, os

In [None]:
# Load one tweets CSV per account. Both frames are later passed to
# df_to_text(), which reads their `text` column.
df_woj = pd.read_csv('wojespn_tweets.csv')
df_shams = pd.read_csv('shamscharania_tweets.csv')

# Create functions for both models

In [None]:
def df_to_text(df, exclude=('https://', '@', '\\')):
    """Turn a tweets DataFrame into a list of cleaned, lower-cased tweet strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column. Each value is expected to look like the
        repr of a bytes object (``b'...'``); the leading ``b'`` and the trailing
        ``'`` are sliced off. ``df`` itself is never modified.
    exclude : iterable of str, optional
        A tweet is dropped entirely when it contains any of these substrings.
        The defaults remove tweets with links, mentions, or a backslash.

    Returns
    -------
    list of str
        The surviving tweets, in their original order.
    """
    # Missing / non-string entries become NaN after the .str operations; drop
    # them here, otherwise the substring test below raises TypeError on a float.
    cleaned = df['text'].str[2:-1].str.lower().dropna()

    return [
        tweet
        for tweet in cleaned.tolist()
        if not any(marker in tweet for marker in exclude)
    ]

In [None]:
def get_sequence_of_tokens(corpus):
    """Fit a Tokenizer on `corpus` and expand each text into n-gram prefixes.

    For a text whose token ids are ``[a, b, c]`` the prefixes ``[a, b]`` and
    ``[a, b, c]`` are produced; texts with fewer than two tokens contribute
    nothing.

    Returns
    -------
    (list of list of int, int, Tokenizer)
        All prefixes in corpus order, ``len(word_index) + 1``, and the fitted
        tokenizer.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(corpus)

    # One more than the number of distinct words -- presumably to leave a slot
    # for index 0, which is not assigned to any word (confirm against Keras docs).
    vocab_size = len(tokenizer.word_index) + 1

    prefixes = []
    for text in corpus:
        ids = tokenizer.texts_to_sequences([text])[0]
        # Every prefix of length 2..len(ids): the last id is the word to
        # predict, everything before it is the context.
        prefixes.extend(ids[:end] for end in range(2, len(ids) + 1))

    return prefixes, vocab_size, tokenizer

In [None]:
def generate_padded_sequences(input_seq, total_words):
    """Pad the n-gram sequences to one length and split off the next-word label.

    Parameters
    ----------
    input_seq : list of list of int
        Token-id prefixes, e.g. from get_sequence_of_tokens().
    total_words : int
        Number of classes used for the one-hot labels.

    Returns
    -------
    (ndarray, ndarray, int)
        Predictors (every column but the last), one-hot labels built from the
        last column, and the padded sequence length.
    """
    longest = max(len(seq) for seq in input_seq)

    # Pad on the left so the word to predict always sits in the final column.
    padded = np.array(pad_sequences(input_seq, maxlen=longest, padding='pre'))

    features = padded[:, :-1]
    targets = ku.to_categorical(padded[:, -1], num_classes=total_words)
    return features, targets, longest

In [None]:
def create_model(max_sequence_len, total_words, embedding_dim=16, lstm_units=256):
    """Build and compile the next-word prediction network.

    Architecture: Embedding -> LSTM -> Dense(softmax over the vocabulary),
    compiled with categorical cross-entropy and the Adam optimizer.

    Parameters
    ----------
    max_sequence_len : int
        Length of the padded training sequences *including* the label column;
        the network input is one shorter.
    total_words : int
        Vocabulary size; used for both the embedding input dimension and the
        width of the softmax output.
    embedding_dim : int, optional
        Size of each word embedding vector (default 16, the previous fixed value).
    lstm_units : int, optional
        Number of LSTM units (default 256, the previous fixed value).

    Returns
    -------
    The compiled Sequential model.
    """
    # The last column of every padded sequence is the label, so the model only
    # ever sees max_sequence_len - 1 tokens.
    input_len = max_sequence_len - 1

    model = Sequential()
    model.add(Embedding(total_words, embedding_dim, input_length=input_len))
    model.add(LSTM(lstm_units))
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

# Create Woj model

In [None]:
# Clean the Woj frame into a plain list of lower-cased tweet strings.
woj_tweets = df_to_text(df_woj)

In [None]:
# Tokenize the Woj corpus and expand each tweet into n-gram prefixes.
# tokenizer_woj is kept because the generation cells need it again.
input_seq, total_words_woj, tokenizer_woj = get_sequence_of_tokens(woj_tweets)

In [None]:
# Pad to a common length and split into predictors / one-hot next-word labels.
# max_seq_len_woj is reused when generating text with the Woj model.
predictors, label, max_seq_len_woj = generate_padded_sequences(input_seq, total_words_woj)

In [None]:
# Build the network sized for the Woj vocabulary and print its layer summary.
model = create_model(max_seq_len_woj, total_words_woj)
model.summary()

In [None]:
# Train on every prefix sequence; no validation data is passed.
model.fit(predictors, label, epochs = 100, batch_size = 64)

In [None]:
# Persist the trained Woj network; it is reloaded with load_model() before
# text generation.
model.save('woj_model.h5')

# Create Shams model

In [None]:
# Clean the Shams frame into a plain list of lower-cased tweet strings.
shams_tweets = df_to_text(df_shams)

In [None]:
# Tokenize the Shams corpus and expand each tweet into n-gram prefixes.
# NOTE: this rebinds `input_seq`, replacing the Woj sequences.
input_seq, total_words_shams, tokenizer_shams = get_sequence_of_tokens(shams_tweets)

In [None]:
# Pad and split the Shams sequences.
# NOTE: this rebinds `predictors` and `label`, replacing the Woj arrays.
predictors, label, max_seq_len_shams = generate_padded_sequences(input_seq, total_words_shams)

In [None]:
# Build a fresh network sized for the Shams vocabulary and print its summary.
# NOTE: this rebinds `model`; the Woj network now only exists in woj_model.h5.
model = create_model(max_seq_len_shams, total_words_shams)
model.summary()

In [None]:
# Train on every Shams prefix sequence; no validation data is passed.
model.fit(predictors, label, epochs = 100, batch_size = 64)

In [None]:
# Persist the trained Shams network; it is reloaded with load_model() before
# text generation.
model.save('shams_model.h5')

# Text generation function

In [None]:
def generate_text(seed_text, next_words, model, max_seq_len, tokenizer):
    """Extend `seed_text` one predicted word at a time, print it, and return it.

    Parameters
    ----------
    seed_text : str
        Starting text; it is re-tokenized on every step together with the
        words generated so far.
    next_words : int
        Number of words to append.
    model
        Trained network whose ``predict`` returns one probability per
        vocabulary index.
    max_seq_len : int
        Padded sequence length used in training (label column included); the
        model input is padded/truncated to ``max_seq_len - 1``.
    tokenizer
        The fitted tokenizer that belongs to `model`.

    Returns
    -------
    str
        The title-cased generated text (the same string that is printed,
        without the trailing newline).
    """
    # Build the id -> word lookup once instead of scanning word_index for
    # every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_len - 1, padding='pre')

        # `predict_classes` has been removed from recent Keras releases; taking
        # the arg-max of `predict` is the equivalent and also works on old ones.
        probabilities = model.predict(token_list, verbose=0)
        predicted_index = int(np.argmax(probabilities, axis=-1)[0])

        # An id with no word attached (e.g. 0) contributes an empty string,
        # matching the previous behaviour.
        seed_text += " " + index_to_word.get(predicted_index, "")

    generated = seed_text.title()
    print(generated + '\n')
    return generated

# Text generation: Woj

In [None]:
# Reload the saved Woj network and point `model` at it: after training,
# `model` is still bound to the Shams network.
woj_model = load_model('woj_model.h5')
model = woj_model

In [None]:
# Player names used as seed text for generation, one per line.
players = [
    'Giannis Antetokounmpo',
    'LeBron James',
    'James Harden',
    'Kawhi Leonard',
    'Kevin Durant',
    'Stephen Curry',
    'Ben Simmons',
    'Kemba Walker',
    'Anthony Davis',
    'Russell Westbrook',
    'Damian Lillard',
    'Kyrie Irving',
    'Luka Doncic',
    'Zion Williamson',
    'Trae Young',
]

In [None]:
# Generate a 35-word continuation for each player seed with the Woj model.
for player in players:
    generate_text(player, 35, model, max_seq_len_woj, tokenizer_woj)

In [None]:
# Team names used as seed text for generation (30 franchises, alphabetical).
# The New Orleans franchise is the Pelicans; "Hornets" belongs to Charlotte,
# which is already listed.
teams = ['Atlanta Hawks', 'Boston Celtics', 'Brooklyn Nets', 'Charlotte Hornets', 'Chicago Bulls', 'Cleveland Cavaliers',
         'Dallas Mavericks', 'Denver Nuggets', 'Detroit Pistons', 'Golden State Warriors', 'Houston Rockets', 'Indiana Pacers',
         'Los Angeles Clippers', 'Los Angeles Lakers', 'Memphis Grizzlies', 'Miami Heat', 'Milwaukee Bucks',
         'Minnesota Timberwolves', 'New Orleans Pelicans', 'New York Knicks', 'Oklahoma City Thunder', 'Orlando Magic',
         'Philadelphia 76ers', 'Phoenix Suns', 'Portland Trail Blazers', 'Sacramento Kings', 'San Antonio Spurs',
         'Toronto Raptors', 'Utah Jazz', 'Washington Wizards']

In [None]:
# Generate a 35-word continuation for each team with the Woj model, seeding
# with "The <team name>".
for team in teams:
    generate_text("The " + team, 35, model, max_seq_len_woj, tokenizer_woj)

# Text generation: Shams

In [None]:
# Reload the saved Shams network and rebind `model` to it, replacing the Woj
# network used in the preceding section.
shams_model = load_model('shams_model.h5')
model = shams_model

In [None]:
# Generate a 35-word continuation for each player with the Shams model,
# seeding with "Sources: <player name>".
for player in players:
    generate_text("Sources: " + player, 35, model, max_seq_len_shams, tokenizer_shams)

In [None]:
# Generate a 35-word continuation for each team with the Shams model,
# seeding with "Sources: the <team name>".
for team in teams:
    generate_text("Sources: the " + team, 35, model, max_seq_len_shams, tokenizer_shams)