In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


In [2]:
file_path = 'intersections_data_1.csv'
data = pd.read_csv(file_path)
data['distance_diff'] = data['p2_distance_run'] - data['p1_distance_run']
data_1 = data[data['server'] == 1]
data_2 = data[data['server'] != 1]

In [14]:
"""
当Player1是发球方的分析
"""
# Feature select(server)
server_feature = ['Label', 'p1_sets', 'p2_sets', 'p1_games', 'p2_games', 'p1_double_fault', 'p1_unf_err', 'p2_unf_err', 'rally_count', 'speed_mph', 'serve_width_B', 'serve_width_BC', 'serve_width_BW', 'serve_width_C', 'serve_width_W', 'serve_depth_CTL', 'serve_depth_NCTL', 'distance_diff', 'p2_distance_run', 'p1_distance_run']
data_1 = data_1[server_feature]

data_1.to_csv('server_1_data.csv', index=False)

# Separating the features and the target variable
X = data_1.drop('Label', axis=1)
y = data_1['Label']

# Normalizing the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# # 选择要归一化的特征
# features_to_normalize = ['rally_count', 'speed_mph', 'distance_diff']
# 
# # 初始化归一化器
# scaler = MinMaxScaler()
# 
# # 只对选定的特征进行归一化
# X_scaled_subset = scaler.fit_transform(X[features_to_normalize])
# 
# # 将归一化后的数据转换为DataFrame
# X_scaled_subset_df = pd.DataFrame(X_scaled_subset, columns=features_to_normalize, index=X.index)
# 
# # 将未进行归一化的特征与归一化后的特征合并
# X_scaled = X.drop(features_to_normalize, axis=1)
# X_scaled = pd.concat([X_scaled, X_scaled_subset_df], axis=1)




# Splitting the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=100)

# Training the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Getting the coefficients
coefficients = model.coef_[0]
feature_names = X.columns
coefficients_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# Evaluating the model's accuracy on the test set
accuracy = model.score(X_test, y_test)

# Display coefficients and accuracy
print(f"When Player1 plays server:")
print(f'Coefficient of features: \n{coefficients_df}')
print(f'\nAccuracy of model: {accuracy}')
print("***********************************************")

When Player1 plays server:
Coefficient of features: 
             Feature  Coefficient
0            p1_sets    -0.660716
1            p2_sets     1.079027
2           p1_games     0.523895
3           p2_games    -0.461889
4    p1_double_fault     0.360961
5         p1_unf_err    -0.310627
6         p2_unf_err    -0.924556
7        rally_count     0.404886
8          speed_mph    -0.641765
9      serve_width_B    -0.342782
10    serve_width_BC    -0.237314
11    serve_width_BW    -0.029538
12     serve_width_C     0.361789
13     serve_width_W     0.111184
14   serve_depth_CTL    -0.393256
15  serve_depth_NCTL     0.391367
16     distance_diff    -0.314234
17   p2_distance_run    -0.570710
18   p1_distance_run    -0.515582

Accuracy of model: 0.75
***********************************************


In [13]:
"""
当Player1是接球方的分析
"""
# Feature select(server)
servee_feature = ['Label', 'p1_sets', 'p2_sets', 'p1_games', 'p2_games', 'p2_double_fault', 'p1_unf_err', 'p2_unf_err', 'rally_count', 'speed_mph', 'return_depth_D', 'return_depth_ND', 'distance_diff', 'p2_distance_run', 'p1_distance_run']
data_2 = data_2[servee_feature]

data_2.to_csv('server_2_data.csv', index=False)

# Separating features and target variable
X_2 = data_2.drop('Label', axis=1)
y_2 = data_2['Label']

# Normalizing the data
scaler_2 = StandardScaler()
X_2_scaled = scaler_2.fit_transform(X_2)

# # 选择要归一化的特征
# features_to_normalize = ['rally_count', 'speed_mph', 'distance_diff']
# 
# # 初始化归一化器
# scaler_2 = MinMaxScaler()
# 
# # 只对选定的特征进行归一化
# X_2_scaled_subset = scaler_2.fit_transform(X_2[features_to_normalize])
# 
# # 将归一化后的数据转换为DataFrame
# X_2_scaled_subset_df = pd.DataFrame(X_2_scaled_subset, columns=features_to_normalize, index=X_2.index)
# 
# # 将未进行归一化的特征与归一化后的特征合并
# X_2_scaled = X_2.drop(features_to_normalize, axis=1)
# X_2_scaled = pd.concat([X_2_scaled, X_2_scaled_subset_df], axis=1)



# Splitting the data into training and test sets
X_2_train, X_2_test, y_2_train, y_2_test = train_test_split(X_2_scaled, y_2, test_size=0.2, random_state=0)

# Training a new logistic regression model on the second dataset
model_2 = LogisticRegression()
model_2.fit(X_2_train, y_2_train)

# Getting the coefficients for the second model
coefficients_2 = model_2.coef_[0]
feature_names_2 = X_2.columns
coefficients_df_2 = pd.DataFrame({'Feature': feature_names_2, 'Coefficient': coefficients_2})

# Evaluating the accuracy of the second model on the test set
accuracy_2 = model_2.score(X_2_test, y_2_test)

# Display coefficients and accuracy
print(f"When Player1 plays servee:")
print(f'Coefficient of features: \n{coefficients_df_2}')
print(f'\nAccuracy of model: {accuracy_2}')
print("***********************************************")

When Player1 plays servee:
Coefficient of features: 
            Feature  Coefficient
0           p1_sets     0.124868
1           p2_sets    -0.149697
2          p1_games     0.519961
3          p2_games    -0.155793
4   p2_double_fault     0.643224
5        p1_unf_err    -0.136075
6        p2_unf_err    -0.087853
7       rally_count    -0.305280
8         speed_mph     0.205257
9    return_depth_D     0.259512
10  return_depth_ND     0.372136
11    distance_diff    -0.093827
12  p2_distance_run     0.399639
13  p1_distance_run     0.333224

Accuracy of model: 0.782608695652174
***********************************************
