In [43]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import tree
import seaborn as sns
from sklearn.model_selection import cross_val_score, cross_validate
from scipy import stats
import matplotlib
from sklearn.ensemble import IsolationForest
from tqdm import tqdm

In [44]:
def load():
    df = pd.read_csv("a.csv/a.csv", decimal='.')
    return df

In [45]:
def filter_pass(df):
    return df

def filter_tral(df):
    result_df = df[df['sure_tral'] == 1]
    return result_df

def filter_not_tral(df):
    result_df = df[df['sure_tral'] == 0]
    return result_df

def filter_equal(df, frac=1.0):
    tral_df = df[df['sure_tral'] == 1]
    # print('tral shape', tral_df.shape)
    not_tral_df = df[df['sure_tral'] != 1]
    not_tral_df = not_tral_df.sample(n=int(tral_df.shape[0]*frac))
    # print('not tral shape', not_tral_df.shape)
    frames = [tral_df, not_tral_df]
    result = pd.concat(frames)
    # print('result shape', result.shape)
    return result

In [46]:
def general_analyze(df):
    tral_df = filter_tral(df)
    not_tral_df = filter_not_tral(df)

    df.info()
    print('********************')
    print('ships count: ', df['ship'].unique().shape[0])
    print('records count: ', df['record'].unique().shape[0])
    print('********************')
    print('class disctribution:')
    print('not trall class - {:.3}'.format(tral_df.shape[0] / df.shape[0]), '({})'.format(tral_df.shape[0]))
    print('trall class - {:.3}'.format(not_tral_df.shape[0] / df.shape[0]), '({})'.format(not_tral_df.shape[0]))
    print('********************')
    print('missed values:')
    for column_name in df.columns:
        df[df['ship'] == None]
        count = df[df[column_name] == 'None'].shape[0]
        print(column_name, count)
    print('not tral error rows:', 
          df[((df['course'] == 'None') | (df['velocity'] == 'None')) & (df['sure_tral'] == 0)].shape[0])
    print('tral error rows:', 
          df[((df['course'] == 'None') | (df['velocity'] == 'None')) & (df['sure_tral'] == 1)].shape[0])

In [47]:
# раскомменитровать для использования
# df = load()
# general_analyze(df)

In [48]:
def proccessing(df, frac=0.01):
    if not frac is None:
        df_sampled = df.sample(frac=frac)
    else:
        df_sampled = df
    df_clear = df_sampled[df_sampled.velocity != 'None']
    df_clear = df_clear[df_clear.course != 'None']
    df_clear['velocity'] = df_clear['velocity'].astype(float)
    df_clear['course'] = df_clear['course'].astype(int)
    return df_clear

In [49]:
def split(df):
    df_x = df[['course', 'velocity']]
    df_y = df[['sure_tral']]
    return df_x, df_y

In [50]:
def learn(df_x, df_y):
    # print('clf x', df_x.shape)
    # print(df_x.head)
    # print('clf y', df_y.shape)
    # print(df_y.head)
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(df_x, df_y)
    return clf

In [51]:
# def evaluate(clf, df_x, df_y):
#     y_predict = clf.predict(df_x)
#     y_real = df_y.to_numpy().reshape((1, -1))
#     # print('predict sum', np.sum(y_predict))
#     # print('real sum', np.sum(y_real))
#     diff = np.sum(y_predict != y_real)
#     # print('diff', diff)
#     value = diff / len(y_predict)
#     # print('len', len(y_predict))
#     return value

def cross_val_learn(clf, X, y):
    learning_result = cross_validate(clf, X, y, return_estimator=True)
    scores = learning_result['test_score']
    score = scores.mean()
    trained_clf = learning_result['estimator'][0]
    
    return score, trained_clf

In [52]:
def viz_features(df):
    g = sns.pairplot(df, hue='sure_tral', palette="YlGnBu")


In [53]:
# def get_result(train_frac, test_frac, test_filter_fn=filter_pass, train_filter_fn=filter_pass):
#     source_df = load()
#     train_df = proccessing(source_df, frac=train_frac)
#     test_df = proccessing(source_df, frac=test_frac)
#     # print('train df shape before filter', train_df.shape)
#     train_df=train_filter_fn(train_df)
#     # print('train df shape after filter', train_df.shape)
#     df_train_x, df_train_y = split(train_df)
#     test_df=test_filter_fn(test_df)
#     df_test_x, df_test_y = split(test_df)
#     clf = learn(df_train_x, df_train_y)
#     print(evaluate(clf, df_test_x, df_test_y))
#     return clf

def get_result(train_frac, test_frac, test_filter_fn=filter_pass, train_filter_fn=filter_pass):
    source_df = load()
    # train_df = proccessing(source_df, frac=train_frac)
    # test_df = proccessing(source_df, frac=test_frac)
    clear_df = proccessing(source_df, frac=None)
    train_df=train_filter_fn(clear_df)
    # print('train df shape after filter', train_df.shape)
    df_train_x, df_train_y = split(train_df)
    # test_df=test_filter_fn(test_df)
    # df_test_x, df_test_y = split(test_df)
    clf = tree.DecisionTreeClassifier()
    score, trained_clf = cross_val_learn(clf, df_train_x, df_train_y)
    print(score)
    return trained_clf

In [54]:
def get_many_result():
    get_result(train_frac=0.1, test_frac=1, test_filter_fn=filter_pass)
    get_result(train_frac=0.1, test_frac=1, test_filter_fn=filter_tral)
    get_result(train_frac=0.1, test_frac=1, test_filter_fn=filter_equal)
    get_result(train_frac=0.1, test_frac=1, test_filter_fn=filter_equal, train_filter_fn=filter_equal)
    get_result(train_frac=0.1, test_frac=1, test_filter_fn=filter_pass, train_filter_fn=filter_equal)
    clf = get_result(train_frac=0.3, test_frac=1, test_filter_fn=filter_equal, train_filter_fn=filter_equal)
    print(clf.predict([[330, 7.56]]))

In [55]:
def validate(clf):
    source_df = pd.read_csv("2people_with_txt_samples/control.csv", decimal='.')
    source_df.head
    # source_df['sure_tral']='x'
    # general_analyze(source_df)

    df_clear = source_df
    df_clear['velocity'] = df_clear['velocity'].replace('None', '0.0')
    df_clear['course'] = df_clear['course'].replace('None', '0')
    df_clear['velocity'] = df_clear['velocity'].astype(float)
    df_clear['course'] = df_clear['course'].astype(int)
    df_result = df_clear
    df_result['sure_tral'] = clf.predict(df_clear[['course', 'velocity']])
    df_result.head

    result = df_result[["record", "sure_tral"]]
    result = result.groupby(by="record").max()
    result.to_csv("out.txt", sep=' ', index=True, header=False)

In [56]:
# раскомменитровать для использования
clf = get_result(1, 1, filter_equal, filter_equal)
# validate(clf):

  source_df = load()


0.9690267545560296


In [57]:
def tree_analyze():
    source_df = load()
    train_frac = 0.1
    train_filter_fn = filter_equal
    train_df = proccessing(source_df, frac=train_frac)
    train_df=train_filter_fn(train_df)
    df_train_x, df_train_y = split(train_df)
    clf = learn(df_train_x, df_train_y)
    text_representation = tree.export_text(clf)
    print(text_representation)
    return train_df

def dataset_analyze(train_df):
    not_trall = train_df[train_df['sure_tral'] == 0]
    trall = train_df[train_df['sure_tral'] == 1]

    f, (ax1, ax2) = plt.subplots(1, 2, sharex='all', sharey='all')

    ax1.scatter(not_trall['course'], not_trall['velocity'], c='#0000FF', label='not_trall', alpha=0.5)
    plt.xlabel('course')
    plt.ylabel('velocity')
    ax1.legend()
    ax2.scatter(trall['course'], trall['velocity'], c='#00FF00', label='trall', alpha=0.5)
    ax2.legend()
    
    plt.rcParams["figure.figsize"] = (12,12)
    plt.savefig('fig1.png', dpi=300)
   #  plt.show();

In [58]:
# раскомменитровать для использования
# train_df = tree_analyze()

In [59]:
# раскомменитровать для использования
# dataset_analyze(train_df)

In [60]:
def get_linear_plot():
    source_df = load()
    train_frac = 0.1
    train_filter_fn = filter_equal
    clear_df = proccessing(source_df, frac=train_frac)
    clear_df=train_filter_fn(clear_df)

    not_trall = clear_df[clear_df['sure_tral'] == 0]
    trall = clear_df[clear_df['sure_tral'] == 1]
    
    not_trall['velocity'] = not_trall['velocity'].astype(str)
    trall['velocity'] = trall['velocity'].astype(str)
    
    trall_unique = trall['velocity'].value_counts(dropna=False)
    trall_values = trall_unique.to_numpy()
    trall_indexes = trall_unique.index.to_numpy()
    trall_count = np.array((trall_values, trall_indexes)).transpose()
    trall_count = trall_count[trall_count[:, 1].argsort()]

    not_trall_unique = not_trall['velocity'].value_counts(dropna=False)
    not_trall_values = not_trall_unique.to_numpy()
    not_trall_indexes = not_trall_unique.index.to_numpy()
    not_trall_count = np.array((not_trall_values, not_trall_indexes)).transpose()
    
    velocity_dict = {}
    for i in range(not_trall_count.shape[0]):
        velocity_dict[not_trall_count[i, 1]] = [not_trall_count[i, 0], 0]

    for i in range(trall_count.shape[0]):
        key = str(trall_count[i, 1])
        if key in velocity_dict.keys():
            velocity_dict[key] = [velocity_dict[key][0], int(trall_count[i, 0])]
        else:
            velocity_dict[key] = [0, int(trall_count[i, 0])]
    
    velocities = np.zeros((len(velocity_dict)))
    trall_velicities = np.zeros((len(velocity_dict)))
    not_trall_velicities = np.zeros((len(velocity_dict)))
    list_keys = list(velocity_dict.keys())

    for i in range(len(list_keys)):
        velocities[i] = float(list_keys[i])
        not_trall_velicities [i] = velocity_dict[list_keys[i]][0]
        trall_velicities [i] = velocity_dict[list_keys[i]][1]
        
    # A python dictionary
    data = {"trall": trall_velicities,

            "not_trall": not_trall_velicities

            };

    # Dictionary loaded into a DataFrame       
    dataFrame = pd.DataFrame(data=data, index = velocities);
    dataFrame = dataFrame.sort_index()

    # Draw a vertical bar chart
    dataFrame.plot.bar(rot=70, title="");

    # from matplotlib.pyplot import figure
    plt.rcParams["figure.figsize"] = (25,25)

    plt.savefig('fig2.png', dpi=300)
    plt.show(block=True);

In [61]:
# раскомменитровать для использования
# get_linear_plot()

In [62]:
# для рисования областей аномалий
def run_and_plot(clf, X, outliers_fraction, draw_legend=True, title=''):
    
    clf.fit(X)
    
    a_prob =  clf.decision_function(X)
    threshold = stats.scoreatpercentile(a_prob, 100 * outliers_fraction)
    
    #print (a_prob)
    
    # print ('ошибка  = ' + str( (clf.predict(X) != y).mean()))
    
    xx, yy = np.meshgrid(np.linspace(0, 360, 500), np.linspace(0, 20, 500))
    
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    
    #print (Z)

    plt.figure(figsize=(5, 5))
    #plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 20), cmap=plt.cm.binary) # plt.cm.Blues_r cmap=plt.cm.Blues_r)
    plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), Z.max(), 20), cmap=plt.cm.binary) # plt.cm.Blues_r cmap=plt.cm.Blues_r)
    a_ = plt.contour(xx, yy, Z, levels=[threshold], linewidths=1, colors='yellow')
    #plt.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='#CCDDFF') # CCDDFF
    b_ = plt.scatter(X['course'], X['velocity'], c='white')
    # c_ = plt.scatter(X[y<0, 0], X[y<0, 1], c='red')
    plt.axis('tight')
    if draw_legend:
        plt.legend(
            # [a_.collections[0], b_, c_],
            [a_.collections[0], b_],
            [u'разделяющая поверхность', u'нормальные объекты', u'выбросы'],
            prop=matplotlib.font_manager.FontProperties(size=11), loc='upper right')
    plt.title(title)
    
    plt.rcParams["figure.figsize"] = (12,12)
    plt.savefig('fig3.png', dpi=300)

def find_anomaly():
    # get dataset not_trall and trall
    source_df = load()
    train_frac = 0.2
    train_filter_fn = filter_equal
    clear_df = proccessing(source_df, frac=train_frac)
    clear_df=train_filter_fn(clear_df)

    not_trall = clear_df[clear_df['sure_tral'] == 0]
    trall = clear_df[clear_df['sure_tral'] == 1]

    not_trall = not_trall[not_trall.velocity.between(0, 20)]
    not_trall = not_trall[['course', 'velocity']]
    not_trall.head
    
    n = 150
    outliers_fraction = 0.1
    clf  = IsolationForest(n_estimators=n, max_samples=0.5, contamination='auto', max_features=2,
                           bootstrap=False, n_jobs=1, random_state=None, verbose=0, warm_start=True)

    run_and_plot(clf, not_trall, outliers_fraction=outliers_fraction, draw_legend=True, title='IForest, contamination=0.1, max_features=1.0')


In [63]:
# раскомменитровать для использования
# find_anomaly()

In [64]:
def plot_trajectory(source_df, ship=None, record=None):
    if not ship is None:
        selected_df = source_df[source_df['ship'] == ship]
    if not record is None:
        selected_df = selected_df[selected_df['record'] == record]

    # train_frac = 0.1
    # train_filter_fn = filter_equal
    # clear_df = proccessing(source_df, frac=train_frac)
    # clear_df=train_filter_fn(clear_df)

    not_trall = selected_df[selected_df['sure_tral'] == 0]
    trall = selected_df[selected_df['sure_tral'] == 1]

    # print(not_trall.head)
    # print(trall.head)
    # 8182    -8263    
    # 8264    -8271    
    f, ax1 = plt.subplots(1, 1)
    ax1.scatter(not_trall['latitude'], not_trall['longitude'], c='#0000FF', label='not_tral', alpha=0.5)
    ax1.scatter(trall['latitude'], trall['longitude'], c='#00FF00', label='tral', alpha=0.5)
    ax1.legend()
    plt.rcParams["figure.figsize"] = (3,3)
    plt.savefig('trajectory.png', dpi=300)
    

In [65]:
# Раскомменитровать для использования
# df = load()
# plot_trajectory(df, 1)

In [66]:
def some_fn():
    # train_frac = 1.0
    # clear_df = proccessing(source_df, frac=1.0)
    head = source_df.head(1000_000)
    head = proccessing(head, frac=None)

    grouped = head.groupby(['record']).agg(latitude=pd.NamedAgg(column='latitude', aggfunc=lambda x: max(x)-min(x)),
                                          sure_tral=pd.NamedAgg(column='latitude', aggfunc=lambda x: max(x)))
    print(grouped.head)

    not_trall = grouped[grouped['sure_tral'] == 0]
    trall = grouped[grouped['sure_tral'] == 1]

    # print(not_trall.head)
    # print(trall.head)
    # 8182    -8263    
    # 8264    -8271    
    f, (ax1, ax2) = plt.subplots(1, 2)
    ax1.scatter(not_trall['velocity'], not_trall['sure_tral'], c='#0000FF', label='not_trall', alpha=0.5)
    ax1.scatter(trall['velocity'], trall['sure_tral'], c='#00FF00', label='trall', alpha=0.5)
    plt.xlabel('velocity')
    plt.ylabel('sure_tral')
    ax1.legend()

    ax2.scatter(not_trall['latitude'], not_trall['sure_tral'], c='#0000FF', label='not_trall', alpha=0.5)
    ax2.scatter(trall['latitude'], trall['sure_tral'], c='#00FF00', label='trall', alpha=0.5)
    plt.xlabel('latitude')
    plt.ylabel('sure_tral')
    ax1.legend()

In [67]:
def time_series_classification():
    source_df = load()
    df_clear = proccessing(df=source_df, frac=None)

    df1 = df_clear
    df1 = df1.rename(columns={'ship': 'ship1', 'record': 'record1', 'time': 'time1', 
                              'latitude': 'latitude1', 'longitude': 'longitude1', 
                              'course': 'course1', 'velocity': 'velocity1', 'sure_tral': 'sure_tral1'})
    df2 = df1.shift(-1)
    df2 = df2.rename(columns={'ship1': 'ship2', 'record1': 'record2', 'time1': 'time2', 
                              'latitude1': 'latitude2', 'longitude1': 'longitude2', 
                              'course1': 'course2', 'velocity1': 'velocity2', 'sure_tral1': 'sure_tral2'})
    df3 = df2.shift(-1)
    df3 = df3.rename(columns={'ship2': 'ship3', 'record2': 'record3', 'time2': 'time3', 
                              'latitude2': 'latitude3', 'longitude2': 'longitude3', 
                              'course2': 'course3', 'velocity2': 'velocity3', 'sure_tral2': 'sure_tral3'})
    df4 = df3.shift(-1)
    df4 = df4.rename(columns={'ship3': 'ship4', 'record3': 'record4', 'time3': 'time4', 
                              'latitude3': 'latitude4', 'longitude3': 'longitude4', 
                              'course3': 'course4', 'velocity3': 'velocity4', 'sure_tral3': 'sure_tral4'})
    df5 = df4.shift(-1)
    df5 = df5.rename(columns={'ship4': 'ship5', 'record4': 'record5', 'time4': 'time5', 
                              'latitude4': 'latitude5', 'longitude4': 'longitude5', 
                              'course4': 'course5', 'velocity4': 'velocity5', 'sure_tral4': 'sure_tral5'})
    df = pd.concat((df1, df2, df3, df4, df5), axis=1)
    df = df[df['record1'] == df['record2']]
    df = df[df['record2'] == df['record3']]
    df = df[df['record3'] == df['record4']]
    df = df[df['record4'] == df['record5']]
    
    # print(df.shape)
    
    work_df = df[['ship1', 'record1', 'time1', 'time2', 'time3', 'time4', 'time5', 
                  'latitude1', 'latitude2', 'latitude3', 'latitude4', 'latitude5',
                 'longitude1', 'longitude2', 'longitude3', 'longitude4', 'longitude5',
                 'course1', 'course2', 'course3', 'course4', 'course5',
                 # 'velocity1', 'velocity2', 'velocity3', 'velocity4', 'velocity5',
                 'sure_tral1']]
    work_df = work_df.rename(columns={'sure_tral1': 'sure_tral'})
    
    work_df['latitude5'] = work_df['latitude5'] - work_df['latitude1']
    work_df['latitude4'] = work_df['latitude4'] - work_df['latitude1']
    work_df['latitude3'] = work_df['latitude3'] - work_df['latitude1']
    work_df['latitude2'] = work_df['latitude2'] - work_df['latitude1']
    work_df['latitude1'] = work_df['latitude1'] - work_df['latitude1']

    work_df['longitude5'] = work_df['longitude5'] - work_df['longitude1']
    work_df['longitude4'] = work_df['longitude4'] - work_df['longitude1']
    work_df['longitude3'] = work_df['longitude3'] - work_df['longitude1']
    work_df['longitude2'] = work_df['longitude2'] - work_df['longitude1']
    work_df['longitude1'] = work_df['longitude1'] - work_df['longitude1']

    work_df['course5'] = work_df['course5'] - work_df['course1']
    work_df['course4'] = work_df['course4'] - work_df['course1']
    work_df['course3'] = work_df['course3'] - work_df['course1']
    work_df['course2'] = work_df['course2'] - work_df['course1']
    work_df['course1'] = work_df['course1'] - work_df['course1']

    # print(work_df.head)
    # df_sampled = work_df.sample(frac=0.1)
    # print(df_sampled.head)
    
    # Обучение и тестирование на сбалансированной выборке
    train_df=filter_equal(work_df, frac=1)
    df_train_x = train_df[[ 
                  'latitude1', 'latitude2', 'latitude3', 'latitude4', 'latitude5',
                 'longitude1', 'longitude2', 'longitude3', 'longitude4', 'longitude5',
                 'course1', 'course2', 'course3', 'course4', 'course5',
                 # 'velocity1', 'velocity2', 'velocity3', 'velocity4', 'velocity5'
    ]]
    df_train_y = train_df['sure_tral']
    
    clf = tree.DecisionTreeClassifier()
    learning_result = cross_validate(clf, df_train_x, df_train_y, return_estimator=True)
    bal_scores = learning_result['test_score']
    bal_score = bal_scores.mean()
    trained_clf = learning_result['estimator'][0]
    print('Balanced score', bal_score)
    
    return trained_clf, work_df

def inaccuracy_decode(clf, df):
    # Расшифровка типов ошибок
    test_df=filter_equal(df)
    df_test_x = test_df[[
                  'latitude1', 'latitude2', 'latitude3', 'latitude4', 'latitude5',
                 'longitude1', 'longitude2', 'longitude3', 'longitude4', 'longitude5',
                 'course1', 'course2', 'course3', 'course4', 'course5',
                 # 'velocity1', 'velocity2', 'velocity3', 'velocity4', 'velocity5'
    ]]
    df_test_y = test_df[['sure_tral']]

    y_predict = clf.predict(df_test_x)
    y_real = df_test_y.to_numpy().reshape((1, -1))

    diff1 = np.sum(y_predict < y_real) # Когда мы не заметили, что он рыбачил
    diff2 = np.sum(y_predict > y_real) # Когда мы ложно заподозрили
    diff3 = np.sum(y_predict == y_real) # Когда предсказание совпало с датасетом
    # print('diff', diff)
    print('Расшифровка предсказаний для отдельных строк:')
    value1 = diff1 / len(y_predict)
    print('По отчетности вылов был, а система выдала его отсутствие', value1)
    value2 = diff2 / len(y_predict)
    print('По отчётности вылова не было, а система заподозрила его наличие', value2)
    value3 = diff3 / len(y_predict)
    print('Данные по отчётам и ответ системы совпали', value3)
    test_df['y_pred'] = clf.predict(test_df[[ 
              'latitude1', 'latitude2', 'latitude3', 'latitude4', 'latitude5',
             'longitude1', 'longitude2', 'longitude3', 'longitude4', 'longitude5',
             'course1', 'course2', 'course3', 'course4', 'course5',
             # 'velocity1', 'velocity2', 'velocity3', 'velocity4', 'velocity5'
    ]])
    grouped_df_result = test_df.groupby(['record1']).agg(y_real=pd.NamedAgg(column='sure_tral', aggfunc=lambda x: max(x)),
                                          y_pred=pd.NamedAgg(column='y_pred', aggfunc=lambda x: max(x)))
    r_diff1 = np.sum(grouped_df_result['y_pred'] < grouped_df_result['y_real']) # Когда мы не заметили, что он рыбачил
    r_diff2 = np.sum(grouped_df_result['y_pred'] > grouped_df_result['y_real']) # Когда мы ложно заподозрили
    r_diff3 = np.sum(grouped_df_result['y_pred'] == grouped_df_result['y_real']) # Когда предсказание совпало с датасетом
    r_value1 = r_diff1 / len(grouped_df_result)
    print('Расшифровка предсказаний для дней целиком:')
    print('По отчетности вылов был, а система выдала его отсутствие', r_value1)
    r_value2 = r_diff2 / len(grouped_df_result)
    print('По отчётности вылова не было, а система заподозрила его наличие', r_value2)
    r_value3 = r_diff3 / len(grouped_df_result)
    print('Данные по отчётам и ответ системы совпали', r_value3)

In [68]:
# Раскомменитровать для использования
# trained_clf, work_df = time_series_classification()
clf = get_result(1, 1, filter_equal, filter_equal)

  source_df = load()


0.9689336952307096


In [69]:
def proc_for_predict(df):
    df_sampled = df
    df_clear = df_sampled[df_sampled.velocity != 'None']
    df_clear = df_clear[df_clear.course != 'None']
    df_clear['velocity'] = df_clear['velocity'].astype(float)
    df_clear['course'] = df_clear['course'].astype(int)
    return df_clear

In [70]:
def predict(df, clf):
    full_df = df
    predict_df = proc_for_predict(df)
    predict_x = predict_df[['course', 'velocity']]
    predict_result = clf.predict(predict_x)
    predict_df["predict"] = predict_result
    predict_se = predict_df["predict"]

    full_df["predict"] = predict_se
    full_df["predict"] = full_df["predict"].fillna(0.49)
    return full_df

In [71]:
def predict_mean(df):
    print("Находим средние значения за день: ")
    df[["predict_mean"]] = -1
    df[["predict_round_mean"]] = -1
    uniques = df["record"].unique()
    for unique in tqdm(uniques):
        test = df[df["record"] == unique]
        mean = test["predict"].mean()
        df.loc[df["record"] == unique, "predict_mean"] = mean
        mean = round(mean)
        df.loc[df["record"] == unique, "predict_round_mean"] = mean
    return df

In [72]:
def full_predict(df, clf):
    df = predict(df, clf)
    df = predict_mean(df)
    return df

In [73]:
# Раскомменитровать для использования
# inaccuracy_decode(trained_clf, work_df)

In [80]:
train = load()
df = full_predict(train, clf)

# valid = pd.read_csv("2people_with_txt_samples/control.csv", decimal='.')
# df = full_predict(valid, clf)

  train = load()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_df["predict"] = predict_se
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_df["predict"] = full_df["predict"].fillna(0.49)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = value


Находим средние значения за день: 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
100%|██████████| 606/606 [00:02<00:00, 219.47it/s]


In [81]:
import ipywidgets as widgets

In [82]:
ships = df["ship"].unique()
selected_ship = widgets.Select(
    options=ships,
    value=ships[0],
    rows=10,
    description="Корабли:",
    disabled=False
)
first_ship_records = df.loc[df["ship"] == selected_ship.value, "record"].unique()
selected_rec = widgets.Select(
    options=first_ship_records,
    value=first_ship_records[0],
    rows=10,
    description="Записи:",
    disabled=False
)
fishing = widgets.Label(value="")
report = widgets.Button(
    description="Отчет",
    disabled='sure_tral' not in df.columns,
    button_style='', # 'success', 'info', 'warning', 'danger' or ''
    icon='check' # (FontAwesome names without the `fa-` prefix)
)

output2 = widgets.Output(
    layout=widgets.Layout(width='calc(100% + 14ex)')
)

In [83]:
def on_selected_ship_change(change):
    with output2:
        selected_rec.options = df.loc[df["ship"] == selected_ship.value, "record"].unique()
        output2.clear_output()
        
def on_selected_record_change(change):
    with output2:
        output2.clear_output()
        rec = df[df["ship"] == selected_ship.value][df["record"] == selected_rec.value]
        if rec["predict_round_mean"].mean() == 1:
            fishing.value = "Рыбачит!"
        else:
            fishing.value = "Отдыхает!"
        print(rec)
        

def report_button(b):
    with output2:
        output2.clear_output()
        recs = df[df["sure_tral"] == 0]
        recs = recs[recs["predict_round_mean"] == 1]
        recs = recs.groupby(by="record").first()
        recs = recs.groupby(by="ship").mean()
        recs = recs.nlargest(100, "predict_mean")
        print(len(recs))
        for i in recs.index.array:
            print("Корабль: ", i, "   Шансы: ", recs.loc[i]["predict_mean"])
        

report.on_click(report_button)
selected_ship.observe(on_selected_ship_change, names='value')
selected_rec.observe(on_selected_record_change, names='value')

In [84]:
display(widgets.HBox([selected_ship, selected_rec, widgets.VBox([fishing, report])]), output2)

HBox(children=(Select(description='Корабли:', options=(1, 2, 3), rows=10, value=1), Select(description='Записи…

Output(layout=Layout(width='calc(100% + 14ex)'))