In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Conv1D, MaxPooling1D, Dropout
from tensorflow.python.ops.numpy_ops import np_utils

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.manifold import TSNE

from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import os
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
SEED = 32  # single place to change the seed used for NumPy-based randomness
# NOTE: this seeds NumPy's global RNG only; nothing here seeds TensorFlow/Keras.
np.random.seed(SEED)

In [None]:
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Distribution graphs (histogram/bar graph) of column data
def plotPerColumnDistribution(df, nGraphShown, nGraphPerRow):
    """Draw one distribution plot per low-cardinality column of ``df``.

    Only columns with more than 1 and fewer than 50 distinct values are kept.
    Numeric columns are drawn as histograms; every other column (booleans
    included) is drawn as a bar chart of its value counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    nGraphShown : int
        Maximum number of columns to plot.
    nGraphPerRow : int
        Number of subplots placed on each row of the grid.

    Returns
    -------
    None
        The figure is shown as a side effect. When no column qualifies a
        short message is printed and nothing is drawn.
    """
    nunique = df.nunique()
    # For displaying purposes, pick columns that have more than 1 and fewer than 50 unique values
    df = df[[col for col in df if 1 < nunique[col] < 50]]
    columnNames = list(df)
    nShown = min(len(columnNames), nGraphShown)
    if nShown <= 0:
        print('No distribution plots shown: no column has more than 1 and fewer than 50 unique values')
        return
    # Ceiling division with integer arithmetic: plt.subplot needs an integer row count
    nGraphRow = (nShown + nGraphPerRow - 1) // nGraphPerRow
    plt.figure(num = None, figsize = (6 * nGraphPerRow, 8 * nGraphRow), dpi = 80, facecolor = 'w', edgecolor = 'k')
    for i in range(nShown):
        plt.subplot(nGraphRow, nGraphPerRow, i + 1)
        columnDf = df.iloc[:, i]
        # Decide from the column dtype rather than from the first value, which may be missing
        isNumeric = pd.api.types.is_numeric_dtype(columnDf) and not pd.api.types.is_bool_dtype(columnDf)
        if isNumeric:
            columnDf.hist()
        else:
            columnDf.value_counts().plot.bar()
        plt.ylabel('counts')
        plt.xticks(rotation = 90)
        plt.title(f'{columnNames[i]} (column {i})')
    plt.tight_layout(pad = 1.0, w_pad = 1.0, h_pad = 1.0)
    plt.show()


In [None]:
# Correlation matrix
def plotCorrelationMatrix(df, graphWidth):
    """Show the pairwise correlation matrix of the usable columns of ``df``.

    Columns containing NaN, non-numeric columns and constant columns are
    dropped first. If fewer than two columns remain, a message is printed and
    nothing is drawn.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. If it carries a ``dataframeName`` attribute, that value is
        used in the plot title.
    graphWidth : float
        Width and height of the square figure, in inches.

    Returns
    -------
    None
    """
    # The name is an ad-hoc attribute set by the caller; fall back to a generic label when it is absent
    filename = getattr(df, 'dataframeName', 'DataFrame')
    df = df.dropna(axis='columns') # drop columns with NaN (axis must be passed by keyword on recent pandas)
    # Correlation is only defined for numeric data; text columns make DataFrame.corr raise on recent pandas
    df = df.select_dtypes(include=[np.number, 'bool'])
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    if df.shape[1] < 2:
        print(f'No correlation plots shown: The number of non-NaN or constant columns ({df.shape[1]}) is less than 2')
        return
    corr = df.corr()
    fig = plt.figure(num=None, figsize=(graphWidth, graphWidth), dpi=80, facecolor='w', edgecolor='k')
    # Draw into the figure created above instead of assuming it is figure number 1
    corrMat = plt.matshow(corr, fignum = fig.number)
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.columns)), corr.columns)
    plt.gca().xaxis.tick_bottom()
    plt.colorbar(corrMat)
    plt.title(f'Correlation Matrix for {filename}', fontsize=15)
    plt.show()


In [None]:
# Scatter and density plots
def plotScatterMatrix(df, plotSize, textSize):
    """Show a scatter matrix with KDE diagonals for up to ten numeric columns.

    Non-numeric columns, columns containing NaN and constant columns are
    dropped. Each upper-triangle panel is annotated with the correlation
    coefficient of its column pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame.
    plotSize : float
        Width and height of the square figure, in inches.
    textSize : float
        Font size of the correlation annotations.

    Returns
    -------
    None
        When no usable column remains a message is printed and nothing is drawn.
    """
    df = df.select_dtypes(include =[np.number]) # keep only numerical columns
    # Remove columns that would lead to df being singular
    df = df.dropna(axis='columns') # axis must be passed by keyword on recent pandas
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    columnNames = list(df)
    if not columnNames:
        print('No scatter plots shown: no numeric, non-NaN, non-constant columns')
        return
    if len(columnNames) > 10: # reduce the number of columns for matrix inversion of kernel density plots
        columnNames = columnNames[:10]
    df = df[columnNames]
    ax = pd.plotting.scatter_matrix(df, alpha=0.75, figsize=[plotSize, plotSize], diagonal='kde')
    corrs = df.corr().values
    # Upper triangle only (k = 1 skips the diagonal, which holds the density plots)
    for i, j in zip(*np.triu_indices_from(ax, k = 1)):
        ax[i, j].annotate('Corr. coef = %.3f' % corrs[i, j], (0.8, 0.2), xycoords='axes fraction', ha='center', va='center', size=textSize)
    plt.suptitle('Scatter and Density Plot')
    plt.show()


### Let's check the 1st file: /kaggle/input/topic-modeling-for-research-articles/Tags.csv

In [None]:
# Preview only: Tags.csv may have more rows in reality, but just the first nRowsRead are loaded.
nRowsRead = 1000  # set to None to read the whole file
df1 = pd.read_csv(
    '/kaggle/input/topic-modeling-for-research-articles/Tags.csv',
    sep=',',
    nrows=nRowsRead,
)
df1.dataframeName = 'Tags.csv'  # read back by plotCorrelationMatrix for its title
nRow, nCol = df1.shape
print(f'There are {nRow} rows and {nCol} columns')

Let's take a quick look at what the data looks like:

In [None]:
# First five rows of the tags preview.
df1.head(n=5)

Distribution graphs (histogram/bar graph) of sampled columns:

In [None]:
plotPerColumnDistribution(df1, nGraphShown=10, nGraphPerRow=5)

### Let's check the 2nd file: /kaggle/input/topic-modeling-for-research-articles/Test.csv

In [None]:
# Preview only: Test.csv may have more rows in reality, but just the first nRowsRead are loaded.
nRowsRead = 1000  # set to None to read the whole file
df2 = pd.read_csv(
    '/kaggle/input/topic-modeling-for-research-articles/Test.csv',
    sep=',',
    nrows=nRowsRead,
)
df2.dataframeName = 'Test.csv'  # read back by plotCorrelationMatrix for its title
nRow, nCol = df2.shape
print(f'There are {nRow} rows and {nCol} columns')

Let's take a quick look at what the data looks like:

In [None]:
# Show only the first rows, like the other preview cells, instead of rendering the whole frame.
df2.head(5)

Distribution graphs (histogram/bar graph) of sampled columns:

In [None]:
plotPerColumnDistribution(df2, nGraphShown=10, nGraphPerRow=5)

Correlation matrix:

In [None]:
plotCorrelationMatrix(df2, graphWidth=8)

Scatter and density plots:

In [None]:
plotScatterMatrix(df2, plotSize=15, textSize=10)

### Let's check the 3rd file: /kaggle/input/topic-modeling-for-research-articles/Train.csv

In [None]:
# Preview only: Train.csv may have more rows in reality, but just the first nRowsRead are loaded.
nRowsRead = 1000  # set to None to read the whole file
df3 = pd.read_csv(
    '/kaggle/input/topic-modeling-for-research-articles/Train.csv',
    sep=',',
    nrows=nRowsRead,
)
df3.dataframeName = 'Train.csv'  # read back by plotCorrelationMatrix for its title
nRow, nCol = df3.shape
print(f'There are {nRow} rows and {nCol} columns')

Let's take a quick look at what the data looks like:

In [None]:
# First five rows of the training preview.
df3.head(n=5)

Distribution graphs (histogram/bar graph) of sampled columns:

In [None]:
plotPerColumnDistribution(df3, nGraphShown=10, nGraphPerRow=5)

Correlation matrix:

In [None]:
plotCorrelationMatrix(df3, graphWidth=8)

Scatter and density plots:

In [None]:
plotScatterMatrix(df3, plotSize=20, textSize=10)

In [None]:
# One column name per line.
for column_name in list(df3):
    print(column_name)

In [None]:
# Missing-value count per column.
df3.isna().sum()

In [None]:
# True when at least one cell of the training preview is missing.
df3.isna().values.any()

In [None]:
# Names of the columns selected from the training frame as prediction targets.
TARGET_COLS = [
    'Analysis of PDEs',
    'Applications',
    'Artificial Intelligence',
    'Astrophysics of Galaxies',
    'Computation and Language',
    'Computer Vision and Pattern Recognition',
    'Cosmology and Nongalactic Astrophysics',
    'Data Structures and Algorithms',
    'Differential Geometry',
    'Earth and Planetary Astrophysics',
    'Fluid Dynamics',
    'Information Theory',
    'Instrumentation and Methods for Astrophysics',
    'Machine Learning',
    'Materials Science',
    'Methodology',
    'Number Theory',
    'Optimization and Control',
    'Representation Theory',
    'Robotics',
    'Social and Information Networks',
    'Statistics Theory',
    'Strongly Correlated Electrons',
    'Superconductivity',
    'Systems and Control',
]

In [None]:
# Inputs are the raw abstracts; training targets are the columns listed in TARGET_COLS.
train_text = df3['ABSTRACT']
train_y = df3[TARGET_COLS]
test_text = df2['ABSTRACT']
# Every test column except the abstract text and the row id.
test_y = df2.loc[:, ~df2.columns.isin(['ABSTRACT', 'id'])]


In [None]:
# Shape of the ABSTRACT column taken from df3 (one entry per loaded training row).
train_text.shape

In [None]:
MAX_NB_WORDS = 20000  # vocabulary cap handed to the tokenizer

# get the raw text data
texts_train = train_text.astype(str)
texts_test = test_text.astype(str)

# finally, vectorize the text samples into a 2D integer tensor
# `num_words` is the current name of the vocabulary-size argument; `nb_words` is its deprecated alias.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, char_level=False)
# The vocabulary is fitted on the training texts only and then applied to both splits.
tokenizer.fit_on_texts(texts_train)
sequences = tokenizer.texts_to_sequences(texts_train)
sequences_test = tokenizer.texts_to_sequences(texts_test)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
# Integer token ids the tokenizer produced for the first training text.
sequences[0]

In [None]:
# Container type of the tokenizer's word -> index mapping, and how many words it holds.
type(tokenizer.word_index), len(tokenizer.word_index)

In [None]:
# Reverse lookup (index -> word) so encoded sequences can be decoded back to text.
index_to_word = {idx: word for word, idx in tokenizer.word_index.items()}

In [None]:
# Decode the first training sequence back into words.
" ".join(index_to_word[token_id] for token_id in sequences[0])

In [None]:
# Decode the second training sequence back into words.
" ".join(index_to_word[token_id] for token_id in sequences[1])

In [None]:
# Token count of every encoded training text, plus summary statistics.
seq_lens = list(map(len, sequences))
print("average length: %0.1f" % np.mean(seq_lens))
print("max length: %d" % max(seq_lens))

In [None]:
# Labelled so the figure can be read on its own.
plt.hist(seq_lens, bins=50)
plt.xlabel('sequence length (tokens)')
plt.ylabel('count')
plt.title('Training sequence lengths');

In [None]:
# Same histogram restricted to sequences shorter than 200 tokens.
plt.hist([length for length in seq_lens if length < 200], bins=50)
plt.xlabel('sequence length (tokens)')
plt.ylabel('count')
plt.title('Training sequence lengths below 200 tokens');

In [None]:
MAX_SEQUENCE_LENGTH = 150

# Bring every encoded text to MAX_SEQUENCE_LENGTH entries (shorter ones are padded with 0s).
x_train, x_test = (
    pad_sequences(encoded, maxlen=MAX_SEQUENCE_LENGTH)
    for encoded in (sequences, sequences_test)
)
print('Shape of data tensor:', x_train.shape)
print('Shape of data test tensor:', x_test.shape)


In [None]:
# Convert the selected label frames to plain NumPy arrays for model.fit.
y_train = np.asarray(train_y)
print('Shape of label tensor:', y_train.shape)

y_test = np.asarray(test_y)
print('Shape of label tensor:', y_test.shape)

In [None]:
# Label values of the first training row.
y_train[0]

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

# Baseline: a small fully connected network fed the padded token-id sequences directly.
n_features = x_train.shape[1]  # width of the padded input, taken from the data instead of a literal
n_labels = y_train.shape[1]    # one output unit per label column

model = Sequential()
model.add(layers.Dense(10, input_dim=n_features, activation='relu'))
model.add(Dropout(0.5))
model.add(layers.Dense(16, activation='relu'))
model.add(Dropout(0.5))
model.add(layers.Dense(16, activation='relu'))
# Sigmoid rather than softmax: binary_crossentropy scores every output unit independently,
# while softmax would force the outputs to sum to 1.
# NOTE(review): assumes a row may carry several labels at once; confirm against the data.
model.add(layers.Dense(n_labels, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

model.fit(x_train, y_train, epochs=25, batch_size=32)

In [None]:
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.layers import Embedding, BatchNormalization, Flatten

# 1D convolutional network over learned word embeddings.
vocab_size = len(tokenizer.word_index) + 1  # +1 leaves room for index 0, the padding value
seq_len = x_train.shape[1]   # padded sequence length, taken from the data instead of a literal
n_labels = y_train.shape[1]  # the output width must equal the number of label columns passed to fit()

model = Sequential()
model.add(Embedding(vocab_size, 128, input_length=seq_len))
model.add(Conv1D(32, 8, activation='relu'))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Conv1D(128, 8, activation='relu'))
model.add(Dropout(0.6))
model.add(BatchNormalization())
model.add(Conv1D(512, 8, activation='relu'))
model.add(Dropout(0.7))
model.add(BatchNormalization())
model.add(MaxPooling1D())
model.add(Flatten())
# The previous fixed width of 29 did not match the label array built from TARGET_COLS (25 columns).
# Sigmoid + binary_crossentropy mirrors the loss used by the dense baseline above.
# NOTE(review): assumes a row may carry several labels at once; confirm against the data.
model.add(layers.Dense(n_labels, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer="adam", metrics=['accuracy'])

model.fit(x_train, y=y_train, batch_size=128, epochs=25, verbose=1)