In [1]:
import gc
import os
import pathlib

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# List the entries of the input directory with pathlib instead of shelling out to `ls`,
# so the cell also works on platforms without that command.
sorted(entry.name for entry in pathlib.Path('../input').iterdir())

In [None]:
# Read ../input/train/train.csv into a DataFrame.
train_csv_path = pathlib.Path('../input') / 'train' / 'train.csv'
train_df = pd.read_csv(train_csv_path)

In [None]:
# Column dtypes, non-null counts and memory footprint of the frame as read from CSV.
train_df.info()

In [None]:
# Summary statistics of the numeric columns.
train_df.describe()

In [None]:
# First rows of the frame.
train_df.head()

In [None]:
# Last rows of the frame.
train_df.tail()

In [None]:
acoustic_data_df = train_df[train_df.columns.drop('time_to_failure')].astype(np.int32)

In [None]:
time_to_failure_df = train_df[train_df.columns.drop('acoustic_data')].astype(np.float32)

In [None]:
compressed_train_df = pd.concat([acoustic_data_df, time_to_failure_df], axis=1)

In [None]:
# Preview of the down-cast frame.
compressed_train_df.head()

In [None]:
# Dtypes and memory footprint after the down-cast.
compressed_train_df.info()

In [None]:
# From here on `train_df` refers to the down-cast frame; the name no longer
# points at the frame that was read from CSV.
train_df = compressed_train_df

In [None]:
# (rows, columns) of the frame now bound to train_df.
train_df.shape

In [None]:
# Release the two intermediate frames: they are not referenced again in the visible
# notebook, and while their names stay bound gc.collect() cannot reclaim them.
del acoustic_data_df
del time_to_failure_df
gc.collect()

In [None]:
train_df.info()

In [None]:
unique_acoustic_data = train_df['acoustic_data'].unique()

In [None]:
unique_time_to_failure = train_df['time_to_failure'].unique()

In [None]:
unique_acoustic_data.shape

In [None]:
unique_time_to_failure.shape

In [None]:
train_df.shape

In [None]:
# Histogram of the acoustic_data values.
# Figure, axes and the drawing call live in one cell and the axes are handed to pandas
# explicitly; without `ax=` DataFrame.hist opens a figure of its own and the prepared
# axes (and their x-limits) are never drawn on.
fig, ax = plt.subplots(figsize=(24, 18))
train_df.hist(column='acoustic_data', bins=200, ax=ax)
# Limits are applied after drawing so the plot call cannot override them.
ax.set_xlim(train_df['acoustic_data'].min(), train_df['acoustic_data'].max())
ax.set_xlabel('acoustic_data')
ax.set_ylabel('number of rows')
plt.show()

In [None]:
train_df['acoustic_data'].count()

In [None]:
#grouped_train_df_by_acoustic = train_df[train_df.columns.drop('time_to_failure')].groupby('acoustic_data')
grouped_train_series_by_acoustic = train_df['acoustic_data'].groupby(by=train_df['acoustic_data'])

In [None]:
# Disabled exploratory call: would list the attributes of the groupby object.
#dir(grouped_train_series_by_acoustic)

In [None]:
# Number of rows per distinct acoustic_data value (index = value, data = count).
count_train_series_acoustic = grouped_train_series_by_acoustic.count()

In [None]:
# Memory used by the counts series, in bytes.
print(count_train_series_acoustic.memory_usage())

In [None]:
# Number of distinct acoustic_data values that were counted.
count_train_series_acoustic.shape

In [None]:
# Counts for the first index labels.
count_train_series_acoustic.head()

In [None]:
# Disabled look-up of the count for a single index label.
#count_train_series_acoustic[-5008]

In [None]:
count_acoustic_more_20 = count_train_series_acoustic[count_train_series_acoustic > 20]

In [None]:
print(count_acoustic_more_20.memory_usage())

In [None]:
count_acoustic_more_20.shape

In [None]:
print(count_acoustic_more_20.min())
print(count_acoustic_more_20.max())

In [None]:
# Disabled figure set-up; the x-limits here would span the range of the counts.
#fig = plt.figure(figsize=(24, 18))
#ax = fig.add_subplot(111)
#ax.set_xlim(count_acoustic_more_20.min(), count_acoustic_more_20.max())

In [None]:
# Most frequent count value(s) among the retained counts.
count_acoustic_more_20.mode()

In [None]:
# Counts for the last index labels.
count_acoustic_more_20.tail()

In [None]:
# Disabled look-up by index label.
#count_acoustic_more_20[73883469]

In [None]:
index_count_acoustic_more_20 = count_acoustic_more_20.index

In [None]:
print(index_count_acoustic_more_20[:10])
print(index_count_acoustic_more_20[-10:])

In [None]:
print(index_count_acoustic_more_20.min())
print(index_count_acoustic_more_20.max())

In [None]:
# The line plot and the histogram have different x quantities (acoustic_data value vs.
# count), so each gets its own axes; the x-limits taken from the index only make sense
# for the line plot.
fig, (ax_by_value, ax_count_hist) = plt.subplots(2, 1, figsize=(24, 18))

# Top: count per acoustic_data value.
ax_by_value.plot(index_count_acoustic_more_20, count_acoustic_more_20, color='green')
ax_by_value.set_xlim(index_count_acoustic_more_20.min(), index_count_acoustic_more_20.max())
ax_by_value.set_xlabel('acoustic_data value')
ax_by_value.set_ylabel('number of occurrences')

# Bottom: how the counts themselves are distributed.
ax_count_hist.hist(count_acoustic_more_20, color='gray', bins=60)
ax_count_hist.set_xlabel('number of occurrences')
ax_count_hist.set_ylabel('number of acoustic_data values')
plt.show()

In [None]:
# Histogram of the retained counts using 500 bins.
fig, ax = plt.subplots(figsize=(24, 18))
ax.hist(count_acoustic_more_20, bins=500, color='gray')
plt.show()

In [None]:
fig = plt.figure(figsize=(24, 18))
ax = fig.add_subplot(111)
ax.set_xlim(index_count_acoustic_more_20.min(), index_count_acoustic_more_20.max())
#ax.hist(count_acoustic_more_20, bins=100, color='gray')
#ax.plot(index_count_acoustic_more_20, count_acoustic_more_20, color='green')
ax.scatter(index_count_acoustic_more_20, count_acoustic_more_20, color='black')
plt.show()

In [None]:
print(count_acoustic_more_20.unique())

In [None]:
print(count_acoustic_more_20.unique().shape)
print(count_acoustic_more_20.shape)

In [None]:
count_acoustic_more_40 = count_train_series_acoustic[count_train_series_acoustic > 40]

In [None]:
print(count_acoustic_more_40.unique().shape)
print(count_acoustic_more_40.shape)
print(count_acoustic_more_40.unique())

In [None]:
fig = plt.figure(figsize=(24, 18))
ax = fig.add_subplot(111)
ax.set_xlim(count_acoustic_more_40.index.min(), count_acoustic_more_40.index.max())
#ax.hist(count_acoustic_more_20, bins=100, color='gray')
#ax.plot(index_count_acoustic_more_20, count_acoustic_more_20, color='green')
ax.scatter(count_acoustic_more_40.index, count_acoustic_more_40, color='black')
plt.show()

In [None]:
# Summary of the per-value counts (the series values) and of the acoustic_data values
# they belong to (the series index). Every label names the expression actually evaluated.
print("count_train_series_acoustic.min(): ", count_train_series_acoustic.min())
print("count_train_series_acoustic.max(): ", count_train_series_acoustic.max())
print("count_train_series_acoustic.index.min(): ", count_train_series_acoustic.index.min())
print("count_train_series_acoustic.index.max(): ", count_train_series_acoustic.index.max())
print("count_train_series_acoustic.mode(): ", count_train_series_acoustic.mode())
print("count_train_series_acoustic.median(): ", count_train_series_acoustic.median())
print("count_train_series_acoustic.mean(): ", count_train_series_acoustic.mean())

In [None]:
# Most frequent count value(s).
count_train_series_acoustic.mode()

In [None]:
# How many acoustic_data values have a count of exactly 0.
count_train_series_acoustic[count_train_series_acoustic == 0].shape

In [None]:
# How many acoustic_data values occur exactly once.
count_train_series_acoustic[count_train_series_acoustic == 1].shape

In [None]:
# Disabled IPython help query for Series.mode.
#count_train_series_acoustic.mode?

In [None]:
# Counts for the first index labels.
count_train_series_acoustic.head()

In [None]:
# Ask the garbage collector to reclaim unreachable objects.
gc.collect()

In [None]:
train_df.head()

In [None]:
zero_time_df = train_df[train_df['time_to_failure'] == 0]

In [None]:
zero_time_df.shape

In [None]:
train_df['time_to_failure'].min()

In [None]:
zero_time_df =  train_df[train_df['time_to_failure'] == train_df['time_to_failure'].min()]

In [None]:
zero_time_df.shape

In [None]:
zero_time_df =  train_df[train_df['time_to_failure'] <= 0.0001]

In [None]:
zero_time_df.index

In [None]:
# time_to_failure (y) against acoustic_data (x) for the rows selected in zero_time_df.
selected_acoustic = zero_time_df['acoustic_data'].values
selected_time_to_failure = zero_time_df['time_to_failure'].values

plt.figure(figsize=(18, 16))
plt.plot(selected_acoustic, selected_time_to_failure)
plt.grid()
plt.show()

In [None]:
plt.figure(figsize=(18, 16))
plt.plot(train_df[train_df['time_to_failure'] <= 0.01].index, train_df[train_df['time_to_failure'] <= 0.01]['acoustic_data'].values)
plt.grid()
plt.show()

In [None]:
plt.figure(figsize=(18, 16))
plt.plot(train_df[train_df['time_to_failure'] <= 0.01].index, train_df[train_df['time_to_failure'] <= 0.01]['time_to_failure'].values)
plt.grid()
plt.show()

In [2]:
# np.diff returns out[i] = a[i + 1] - a[i], so the result has one element fewer than
# the column it is computed from.
time_to_failure_diffs = np.diff(train_df['time_to_failure']).astype(np.float32)

In [None]:
time_to_failure_diffs.shape[0] == train_df.shape[0]

In [None]:
time_to_failure_diffs.shape

In [None]:
train_df.shape

In [None]:
print(time_to_failure_diffs[:10])
print(time_to_failure_diffs[-10:])

In [None]:
time_to_failure_diffs = np.concatenate((time_to_failure_diffs, [0])).astype(np.float32)

In [None]:
time_to_failure_diffs.shape

In [None]:
train_df['time_to_failure_diffs'] = time_to_failure_diffs

In [None]:
# First rows, now including the time_to_failure_diffs column.
train_df.head()

In [None]:
# Dtypes and memory footprint with the extra column.
train_df.info()

In [None]:
# The values were assigned to train_df['time_to_failure_diffs'] above; drop the
# stand-alone name and ask the garbage collector to reclaim unreachable objects.
del time_to_failure_diffs
gc.collect()

In [None]:
(train_df['time_to_failure_diffs'] == 0).count()

In [None]:
(train_df['time_to_failure_diffs'] == np.inf).any()

In [None]:
(train_df['time_to_failure_diffs'] == np.nan).any()

In [None]:
((train_df['time_to_failure_diffs'] != 0.0).astype(np.int32)).sum()

In [None]:
# Distribution of the per-row mean over the feature columns of train_df.
# time_to_failure and the difference column derived from it are left out so they are not
# averaged together with acoustic_data; errors='ignore' keeps the cell usable when the
# difference column has not been created yet.
features = train_df.columns.drop(['time_to_failure', 'time_to_failure_diffs'], errors='ignore').values

plt.figure(figsize=(16, 6))
# Only the train frame is plotted, so the title says so.
plt.title("Distribution of mean values per row in the train set")
# NOTE(review): sns.distplot is deprecated in newer seaborn releases - confirm the
# installed version before switching to histplot/displot.
sns.distplot(train_df[features].mean(axis=1), color="green", kde=True, bins=120, label='train')
plt.legend()
plt.show()