In [44]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

def merge_asian_labels(label):
  asian_countries = ["China", "Japan", "South Korea", "Taiwan", "Hong Kong"]
  if label in asian_countries:
    return "Asia"
  return label

df = pd.read_csv('embeddings.csv')
nationalities = df['nationality']
usa_asia_labels = nationalities.map(merge_asian_labels)

lb = LabelBinarizer()
labels = lb.fit_transform(usa_asia_labels)
labels = labels.reshape(labels.size,)

X_train, X_test, y_train, y_test = train_test_split(df, labels)
features_train = X_train.drop(["name", "nationality", "text", "date"], axis = 1).values
features_test = X_test.drop(["name", "nationality", "text", "date"], axis = 1).values
scaler = StandardScaler()
ft_scaled_train = scaler.fit_transform(features_train)
scaler = StandardScaler()
ft_scaled_test = scaler.fit_transform(features_test)

model = LogisticRegression(random_state=0)
model.fit(ft_scaled_train, y_train)
y_pred = model.predict(ft_scaled_test)
print(classification_report(y_test, y_pred, target_names=lb.classes_))

               precision    recall  f1-score   support

         Asia       0.44      0.09      0.16       190
United States       0.99      1.00      0.99     15759

     accuracy                           0.99     15949
    macro avg       0.71      0.55      0.57     15949
 weighted avg       0.98      0.99      0.98     15949



In [54]:
classification_vals = np.array([x + 2*y for x, y in zip(y_pred, y_test)])

tp = np.array(np.where(classification_vals == 3)).tolist()[0]
fp = np.array(np.where(classification_vals == 1)).tolist()[0]
tn = np.array(np.where(classification_vals == 0)).tolist()[0]
fn = np.array(np.where(classification_vals == 2)).tolist()[0]

pd.set_option('display.max_colwidth', None)
display(X_test[["name", "nationality", "text"]].iloc[tn])
# display(X_test[["name", "nationality", "text"]].iloc[tp])

Unnamed: 0,name,nationality,text
734,Masahiro Tanaka,Japan,"Masahiro, do you anticipate coming back next year, or are you going to sit down after the season's over and think about whether you want to opt out of your contract?"
472,Shohei Ohtani,Japan,"This season started out kind of tumultuous for you in Korea with the Ippei situation and your first year with the Dodgers. Getting from that moment to right now mentally, what was this year like for you, and how were you able to get to this point?"
354,Shohei Ohtani,Japan,"Shohei, after this kind of season, making all those starts, all of those at-bats, how are you feeling physically?"
407,Daisuke Matsuzaka,Japan,"Sorry to keep bringing back last year, but at the beginning of the playoffs last season you mentioned that your approach would be to continue what you were doing through the regular season. Unlike Japan, where the playoffs don't start right away after the end of the regular season, you face your first start immediately, and what are some of the thoughts going through your mind now as compared to last year? \r\n DAISUKE MATSUZAKA (THROUGH INTERPRETER): As you said, the season just ended, and my start is already right around the corner, but during this week I've stayed on the same rhythm and stayed with the same program in terms of preparation and if I think too much that this is the playoffs, that this is going to be different, I think there is a danger there. \r\n If we can maintain our own pace as a team, I think we have a better chance of winning."
551,Shohei Ohtani,Japan,"Japan became only the second team to be undefeated in WBC history. In your position as a great voice for Japanese baseball, what is your message to the next generation of players looking up to you hoping to be in the same position as you one day?"
161,Yu Darvish,Japan,"Since you're pitching in the playoffs, there is probably going to be more nervousness and pressure, you're pitching in the playoffs compared to a regular season outing. But you mentioned changing the mechanics and your pitch selection, the approach has been changed. Do you think that's going to benefit going into the high-intensity game?"
37,Hyun-Jin Ryu,South Korea,"Hyun-Jin, yesterday when we did an interview with you, you mentioned this feels surreal, it feels almost like a road trip rather than an All-Star Game. How do you feel like sitting here in front of all these reporters? And when you signed with the Dodgers did you envision yourself speaking in front of this crowd as an All-Star pitcher?"
382,Kaz Matsui,Japan,"What does this streak that the team has been on feel like? Have you ever been through anything like that in Japan? And when you came back from your injury, do you feel like you brought new energy to the team in the middle of the streak?"
301,Hiroki Kuroda,Japan,Before the game Torre said that the pitcher has to pitch both sides of the plate. And what did you think about your pitching tonight?
564,Hyun-Jin Ryu,South Korea,The Nationals announced their rotation after the Dodgers did and they announced Max Scherzer as their No. 3 starter. What was your response to that or reaction to that?
