In [4]:
import re
import os
import sys
import csv
import pickle
import math
import numpy as np
import pandas as pd
from random import seed
from random import randint
from nltk.stem import PorterStemmer
from IPython.display import display, HTML
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

In [5]:
StopWords=open("Stopword-List.txt")
StopWords=StopWords.readlines()
ps = PorterStemmer()

# FILTERING OF DATA

In [6]:
def GenerateTokensBySpace(File):
    Tokens=[]
    for Word in File:
        Token=""
        for Character in Word:
            if Character!=" ":
                Token+=Character
            else:
                Tokens.append(Token)
                Token=""
    return Tokens
def RemovingDots(Tokens):
    Result=[]
    for Token in Tokens:
        if Token.count(".")>=2: #For Initials like U.S.A
            Result.append(Token.replace(".",""))
        else:
            SplitByDot=re.split("\.",Token) # For Words Like Thousands.So
            for Word in SplitByDot:
                if Word!="":
                    Result.append(Word)
    return Result

def RemovingContractions(Tokens):
    Result=[]
    for Token in Tokens:
        Word=Token.replace("?","").replace(":","").replace(",","").replace('"',"")
        Word=re.split(r"n't",Word)
        if len(Word)>1:
            Word[1]="not"
        if len(Word)<2:
            Word=re.split(r"'s",Word[0])
            if len(Word)>1:
                Word[1]="is"
        if len(Word)<2:
            Word=re.split(r"'re",Word[0])
            if len(Word)>1:
                Word[1]="are"
        if len(Word)<2:
            Word=re.split(r"'m",Word[0])
            if len(Word)>1:
                Word[1]="am"
        if len(Word)<2:
            Word=re.split(r"'ll",Word[0])
            if len(Word)>1:
                Word[1]="will"
        if len(Word)<2:
            Word=re.split(r"'ve",Word[0])
            if len(Word)>1:
                Word[1]="have"
        if len(Word)<2:
            Word=re.split(r"'d",Word[0])
            if len(Word)>1:
                Word[1]="had"
        for W in Word:
            if W!="":
                Result.append(W)
    return Result

def LOWERCASECONVERTOR(Tokens):
    Result=[]
    for Token in Tokens:
        Result.append(Token.lower())
    return Result

def RemovingBraces(Tokens): #[]
    Result=[]
    for Token in Tokens:
        Words=re.split(r"\[(\w+)\]",Token)
        for Word in Words:
            if Word!="":
                Result.append(Word)
    return Result

def RemovingHypens(Tokens):
    Result=[]
    for Token in Tokens:
        Words=re.split(r"\-",Token)
        for Word in Words:
            if Word!="":
                Result.append(Word)
    return Result

def PorterStemming(Tokens):
    Result=[]
    for Token in Tokens:
        Result.append(ps.stem(Token))
    return Result

def GeneratingStopWordsList(File):
    StopWordList=[]
    for word in StopWords:
        word=re.split("\\n",word)
        if word[0]!="":
            StopWordList.append(word[0].replace(" ",""))
    return StopWordList


def RemovingStopWords(Tokens,StopWordList):
    Result=[]
    for Token in Tokens:
        if Token not in StopWordList:
            Result.append(Token)
    return Result

def FinalFilter(SortedKeys):
    
    NewKeys=[]
    
    for i in SortedKeys:
        i=i.replace("'","").replace(";","").replace(")","").replace("(","").replace("[","").replace("]","").replace("Ã¢Â","").replace("Ã¢Â","")
        NewKeys.append(i)
    while "" in NewKeys:
        NewKeys.remove("")
        
    return NewKeys

## Generating Dictionary Keys

In [7]:
def GeneratePostingList(directory):
    
    Dictionary={}
    
    IndexOfFile=0

    Folders=next(os.walk(directory))[1]

    for Folder in Folders:
    
        Files=next(os.walk(directory+Folder))[2]
    
        for FileName in Files:
        
            File=open(directory+Folder+'/'+FileName)
        
            Speech=File.readlines()
        
            # Filtering Data
        
            StopWordList=GeneratingStopWordsList(StopWords)
        
            Tokens=GenerateTokensBySpace(Speech)
        
            Tokens=RemovingContractions(Tokens)
        
            Tokens=RemovingDots(Tokens)
        
            Tokens=LOWERCASECONVERTOR(Tokens)
        
            Tokens=RemovingBraces(Tokens)
        
            Tokens=RemovingHypens(Tokens)
        
            Tokens=FinalFilter(Tokens)
        
            Tokens=RemovingStopWords(Tokens,StopWordList)
        
            Tokens=PorterStemming(Tokens)
        
        
            for i in range(0,len(Tokens)):
                
                Dictionary.setdefault(Tokens[i],{})
        
            IndexOfFile+=1
        
    return Dictionary

In [8]:
Dictionary=GeneratePostingList('bbcsport-fulltext/bbcsport/')
SortedKeys=sorted(Dictionary)

# Train Data set Information

In [9]:
def GetTotalDocuments(directory):
    TotalTrainDoc=0
    
    Folders=next(os.walk(directory))[1]

    for Folder in Folders:
    
        Files=next(os.walk(directory+Folder))[2]
    
        for FileName in Files:
            
            TotalTrainDoc+=1
    
    return TotalTrainDoc

def GetTotalFolder(directory):
    
    return len(next(os.walk('bbcsport-fulltext/bbcsport/'))[1])
  
def GetTotalFilesInClass(ClassName,directory):
    
    Files=next(os.walk(directory+ClassName))[2]
    
    return len(Files)

## Printing Train Data Set Information

In [10]:
print("*"*124)
print("Total Train Folders          : ",GetTotalFolder('bbcsport-fulltext/bbcsport/'))
print("Total Train Documents        : ",GetTotalDocuments('bbcsport-fulltext/bbcsport/'))
print("Documents In Athletics Class : ",GetTotalFilesInClass('athletics','bbcsport-fulltext/bbcsport/'))
print("Documents In Cricket Class   : ",GetTotalFilesInClass('cricket','bbcsport-fulltext/bbcsport/'))
print("Documents In Football Class  : ",GetTotalFilesInClass('football','bbcsport-fulltext/bbcsport/'))
print("Documents In Rugby Class     : ",GetTotalFilesInClass('rugby','bbcsport-fulltext/bbcsport/'))
print("Documents In Tennis Class    : ",GetTotalFilesInClass('tennis','bbcsport-fulltext/bbcsport/'))
print("*"*124)
TotalTrainDocuments=GetTotalDocuments('bbcsport-fulltext/bbcsport/')

****************************************************************************************************************************
Total Train Folders          :  5
Total Train Documents        :  517
Documents In Athletics Class :  71
Documents In Cricket Class   :  87
Documents In Football Class  :  186
Documents In Rugby Class     :  103
Documents In Tennis Class    :  70
****************************************************************************************************************************


# Generating Train Data Set Files

In [11]:
def GeneratingTrainCsvFile(Dictionary):
    
    SortedKeys=sorted(Dictionary) 
    
    MatrixForTrainDataset=[[0 for i in range(0,len(SortedKeys)+1)]for j in range(0,TotalTrainDocuments)]

    TermIdfMatrix=[[0 for i in range(0,len(SortedKeys)+1)]for j in range(0,TotalTrainDocuments)]
    
    IndexOfFile=0

    Folders=next(os.walk('bbcsport-fulltext/bbcsport/'))[1]

    for Folder in Folders:
    
        Files=next(os.walk('bbcsport-fulltext/bbcsport/'+Folder))[2]
    
        for FileName in Files:
        
            File=open("bbcsport-fulltext/bbcsport/"+Folder+'/'+FileName)
        
            Speech=File.readlines()
        
            # Filtering Data
        
            StopWordList=GeneratingStopWordsList(StopWords)
        
            Tokens=GenerateTokensBySpace(Speech)
        
            Tokens=RemovingContractions(Tokens)
        
            Tokens=RemovingDots(Tokens)
        
            Tokens=LOWERCASECONVERTOR(Tokens)
        
            Tokens=RemovingBraces(Tokens)
        
            Tokens=RemovingHypens(Tokens)
            
            Tokens=FinalFilter(Tokens)
        
            Tokens=RemovingStopWords(Tokens,StopWordList)
        
            Tokens=PorterStemming(Tokens)
        
        
            for i in range(0,len(Tokens)):
            
                MatrixForTrainDataset[IndexOfFile][SortedKeys.index(Tokens[i])]+=1
            
                TermIdfMatrix[IndexOfFile][SortedKeys.index(Tokens[i])]=1
        
            MatrixForTrainDataset[IndexOfFile][len(MatrixForTrainDataset[IndexOfFile])-1]=Folder
        
            TermIdfMatrix[IndexOfFile][len(MatrixForTrainDataset[IndexOfFile])-1]=Folder
        
            IndexOfFile+=1
    
    
    for column in range(0,len(TermIdfMatrix[0])-1):
        
        NoOfDocumentForTerm=0
    
        for row in range(0,len(TermIdfMatrix)):
            
            NoOfDocumentForTerm+=TermIdfMatrix[row][column]
    
    
        for row in range(0,len(TermIdfMatrix)):
            
            if MatrixForTrainDataset[row][column]!=0:
                
                MatrixForTrainDataset[row][column]*=(math.log2(TotalTrainDocuments/NoOfDocumentForTerm))
    
    
    with open('traindata.p', 'wb') as fp:
        pickle.dump(MatrixForTrainDataset,fp, protocol=pickle.HIGHEST_PROTOCOL)
    
    with open('IdfScores.p', 'wb') as fp:
        pickle.dump(TermIdfMatrix,fp, protocol=pickle.HIGHEST_PROTOCOL)
    
    
    return TermIdfMatrix,MatrixForTrainDataset

In [12]:
try:
    print("*"*124)
    
    File = open("IdfScores.p","rb")
    TermIdfMatrix=pickle.load(File)
    
    File = open("traindata.p","rb")
    MatrixForTrainDataset=pickle.load(File)
    
    print("File Found")
    
    print("Printing Train Data .................")
    
    columns=SortedKeys.copy()
    
    columns.append("Class")
    
    File=pd.DataFrame(MatrixForTrainDataset,columns=columns)
    
    display(File)
    
    print("*"*124)
    
except:
    
    print("File Not Found")
    
    print("Creating Files .................")
    
    TermIdfMatrix,MatrixForTrainDataset=GeneratingTrainCsvFile(Dictionary)
    
    columns=SortedKeys.copy()
    
    columns.append("Class")
    
    File=pd.DataFrame(MatrixForTrainDataset,columns=columns)
    
    display(File)
    
    print("Files Created !")

****************************************************************************************************************************
File Found
Printing Train Data .................


Unnamed: 0,$1,$125000,$20m,$25m,$30000,&,&#1637m,&#1638m,0,00,...,â£6,â£600000,â£62m,â£65m,â£6m,â£70m,â£7m,â£7million,â£8m,Class
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.474862,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,athletics
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,athletics
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,athletics
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,athletics
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,athletics
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,tennis
513,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.474862,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,tennis
514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.949723,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,tennis
515,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,tennis


****************************************************************************************************************************


# Test Data Information

### Printing Test Data Information

In [13]:
print("*"*124)
print("Total Test Folders          : ",GetTotalFolder('Test/'))
print("Total Test Documents        : ",GetTotalDocuments('Test/'))
print("Documents In Athletics Class : ",GetTotalFilesInClass('athletics','Test/'))
print("Documents In Cricket Class   : ",GetTotalFilesInClass('cricket','Test/'))
print("Documents In Football Class  : ",GetTotalFilesInClass('football','Test/'))
print("Documents In Rugby Class     : ",GetTotalFilesInClass('rugby','Test/'))
print("Documents In Tennis Class    : ",GetTotalFilesInClass('tennis','Test/'))
print("*"*124)
TotalTestDocuments=GetTotalDocuments('Test/')

****************************************************************************************************************************
Total Test Folders          :  5
Total Test Documents        :  220
Documents In Athletics Class :  30
Documents In Cricket Class   :  37
Documents In Football Class  :  79
Documents In Rugby Class     :  44
Documents In Tennis Class    :  30
****************************************************************************************************************************


In [14]:
def GeneratingTestCsvFile(Dictionary,TermIdfMatrix):
    
    SortedKeys=sorted(Dictionary)
    
    MatrixForTestDataset=[[0 for i in range(0,len(SortedKeys)+1)]for j in range(0,TotalTestDocuments)]
    
    IndexOfFile=0

    Folders=next(os.walk('Test/'))[1]

    for Folder in Folders:
    
        Files=next(os.walk('Test/'+Folder))[2]
    
        for FileName in Files:
        
            File=open("Test/"+Folder+'/'+FileName)
        
            Speech=File.readlines()
        
            # Filtering Data
        
            StopWordList=GeneratingStopWordsList(StopWords)
        
            Tokens=GenerateTokensBySpace(Speech)
        
            Tokens=RemovingContractions(Tokens)
        
            Tokens=RemovingDots(Tokens)
        
            Tokens=LOWERCASECONVERTOR(Tokens)
        
            Tokens=RemovingBraces(Tokens)
        
            Tokens=RemovingHypens(Tokens)
            
            Tokens=FinalFilter(Tokens)
        
            Tokens=RemovingStopWords(Tokens,StopWordList)
        
            Tokens=PorterStemming(Tokens)
        
        
            for i in range(0,len(Tokens)):
                try:
                    MatrixForTestDataset[IndexOfFile][SortedKeys.index(Tokens[i])]+=1
                except:
                    pass
                
        
            MatrixForTestDataset[IndexOfFile][len(MatrixForTestDataset[IndexOfFile])-1]=Folder
        
            IndexOfFile+=1
      
    
    for column in range(0,len(TermIdfMatrix[0])-1):
    
        NoOfDocumentForTerm=0
    
        for row in range(0,len(TermIdfMatrix)):
        
            NoOfDocumentForTerm+=TermIdfMatrix[row][column]
    
    
        for row in range(0,len(MatrixForTestDataset)):
        
            if MatrixForTestDataset[row][column]!=0:
            
                MatrixForTestDataset[row][column]*=(math.log2(TotalTrainDocuments/NoOfDocumentForTerm))
    
    
    
    with open('testdata.p', 'wb') as fp:
        
        pickle.dump(MatrixForTestDataset,fp, protocol=pickle.HIGHEST_PROTOCOL)
    
    return MatrixForTestDataset
        

In [15]:
try:
    print("*"*124)
    
    File = open("testdata.p","rb")
    
    MatrixForTestDataset=pickle.load(File)
    
    print("File Found")
    
    print("Printing Test Data .................")
    
    columns=SortedKeys.copy()
    
    columns.append("Class")
    
    File=pd.DataFrame(MatrixForTestDataset,columns=columns)
    
    display(File)
    
    print("*"*124)
    
except:
    
    print("File Not Found")
    
    print("Creating Files .................")
    
    MatrixForTestDataset=GeneratingTestCsvFile(Dictionary,TermIdfMatrix)
    
    columns=SortedKeys.copy()
    
    columns.append("Class")
    
    File=pd.DataFrame(MatrixForTestDataset,columns=columns)
    
    display(File)
    
    print("Files Created !")

****************************************************************************************************************************
File Found
Printing Test Data .................


Unnamed: 0,$1,$125000,$20m,$25m,$30000,&,&#1637m,&#1638m,0,00,...,â£6,â£600000,â£62m,â£65m,â£6m,â£70m,â£7m,â£7million,â£8m,Class
0,0,0,0,0.0,0,0.0,0,0,0.000000,0,...,0.0,0,0,0,0.0,0,0,0,0.0,athletics
1,0,0,0,0.0,0,0.0,0,0,0.000000,0,...,0.0,0,0,0,0.0,0,0,0,0.0,athletics
2,0,0,0,0.0,0,0.0,0,0,0.000000,0,...,0.0,0,0,0,0.0,0,0,0,0.0,athletics
3,0,0,0,0.0,0,0.0,0,0,0.000000,0,...,0.0,0,0,0,0.0,0,0,0,0.0,athletics
4,0,0,0,0.0,0,0.0,0,0,0.000000,0,...,0.0,0,0,0,0.0,0,0,0,0.0,athletics
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
215,0,0,0,0.0,0,0.0,0,0,0.000000,0,...,0.0,0,0,0,0.0,0,0,0,0.0,tennis
216,0,0,0,0.0,0,0.0,0,0,2.474862,0,...,0.0,0,0,0,0.0,0,0,0,0.0,tennis
217,0,0,0,0.0,0,0.0,0,0,0.000000,0,...,0.0,0,0,0,0.0,0,0,0,0.0,tennis
218,0,0,0,0.0,0,0.0,0,0,0.000000,0,...,0.0,0,0,0,0.0,0,0,0,0.0,tennis


****************************************************************************************************************************


In [16]:
def GeneratingTestOutputCsvFile():
    
    print("*"*124)
    
    print("Generating Matrix To Be Saved in File")
    
    TrainDocIndex=0

    Output=[]

    Folders=next(os.walk('Test/'))[1]

    for Folder in Folders:
    
        Files=next(os.walk('Test/'+Folder))[2]
    
        for FileName in Files:
            
            l=[]
            
            l.append("DOCUMENT "+str(TrainDocIndex))
            
            l.append(Folder)
            
            Output.append(l)
        
            TrainDocIndex+=1
        
    
    Matrix=np.array(Output)
    
    print("Matrix Generated")
    
    columns=['Actual Output']
    
    columns.insert(0,' DOCUMENT INDEX ')
    
    print("Creating File................")
    
    File=pd.DataFrame(Matrix,columns=columns)
    
    File.to_csv("Test output.csv",index=False)
    
    print("File Created ")
    
    print("Printing Test Data Set ")
    
    display(File)
    
    print("*"*124)

In [17]:
GeneratingTestOutputCsvFile()

****************************************************************************************************************************
Generating Matrix To Be Saved in File
Matrix Generated
Creating File................
File Created 
Printing Test Data Set 


Unnamed: 0,DOCUMENT INDEX,Actual Output
0,DOCUMENT 0,athletics
1,DOCUMENT 1,athletics
2,DOCUMENT 2,athletics
3,DOCUMENT 3,athletics
4,DOCUMENT 4,athletics
...,...,...
215,DOCUMENT 215,tennis
216,DOCUMENT 216,tennis
217,DOCUMENT 217,tennis
218,DOCUMENT 218,tennis


****************************************************************************************************************************


# Loading Files

In [18]:
def LoadFiles():
    
#     train=pd.read_csv('Train.csv')
    
#     test=pd.read_csv('Test.csv')
    
    output=pd.read_csv('Test output.csv')
    
    with open('traindata.p', 'rb') as fp:
        MatrixForTrainDataset=pickle.load(fp)
    with open('testdata.p', 'rb') as fp:
        MatrixForTestDataset=pickle.load(fp)
    
    
#     return train,test,output,MatrixForTestDataset,MatrixForTrainDataset

    return output,MatrixForTestDataset,MatrixForTrainDataset

In [19]:
# train,test,output,MatrixForTestDataset,MatrixForTrainDataset=LoadFiles()
output,MatrixForTestDataset,MatrixForTrainDataset=LoadFiles()

# Generating Prediction File

In [20]:
def MakingPredictionFile(MatrixForTestDataset,MatrixForTrainDataset):
    
    index=0
    PredictCorrect=0
    PredictWrong=0
    ActualClass="athletics"
    TotalClassDocuments=0

    PredictedResult=[]

    for Test in range(0,len(MatrixForTestDataset)):
    
    
        Result={}
    
    
        ActualClass=MatrixForTestDataset[Test][len(MatrixForTestDataset[0])-1]
    
    
        Distance1=sys.maxsize
        Distance2=sys.maxsize
        Distance3=sys.maxsize
    
        Class1=""
        Class2=""
        Class3=""
    
        DocTest=MatrixForTestDataset[Test][1:len(MatrixForTestDataset[Test])-1]
    
        for Train in range(0,len(MatrixForTrainDataset)):
        
        
            ClassTrain=MatrixForTrainDataset[Train][len(MatrixForTrainDataset[0])-1]
        
            DocTrain=MatrixForTrainDataset[Train][1:len(MatrixForTrainDataset[Train])-1]
        
            distance = math.sqrt(sum([(float(a) - float(b)) ** 2 for a, b in zip(DocTest,DocTrain)]))
        
#         Result.setdefault(distance,ClassTrain)
    
    
            if distance<Distance1 and distance<Distance2 and distance<Distance3:
            
                Distance1=distance
                Class1=ClassTrain
            
            elif distance<Distance2 and distance<Distance3:
            
                Distance2=distance
                Class2=ClassTrain
            
            elif distance<Distance3:
            
                Distance3=distance
                Class3=ClassTrain
      
    
#     print([b for a, b in sorted(Result.items())][0:3])
    
        Result.setdefault(Distance3,Class3)
        Result.setdefault(Distance2,Class2)
        Result.setdefault(Distance1,Class1)
    
        index=0
    
        DistanceSortedResult={}
    
    
        for i in sorted(Result):
        
            if Result[i] in DistanceSortedResult.keys():
            
                j=DistanceSortedResult[Result[i]]
                j+=1
                DistanceSortedResult[Result[i]]=j
            
            else:
                DistanceSortedResult.setdefault(Result[i],1)
            index+=1
            if index==3:
                break
    
        PredictedClass=""
        for key,value in DistanceSortedResult.items():
            PredictedClass=key
            break

        PredictedResult.append(PredictedClass)
    
        if PredictedClass==ActualClass:
            PredictCorrect+=1
        else:
            PredictWrong+=1
        
        TotalClassDocuments+=1
    
    Prediction=[]
    docno=0
    for i in PredictedResult:
        l=[]
        doc="Document "+str(docno)
        l.append(doc)
        l.append(i)
        Prediction.append(l)
        docno+=1
    pd.DataFrame(Prediction,columns=["Document Index","Predicted Ouptut"]).to_csv("Predicted Output.csv",index=False)

In [21]:
try:
    print("*"*124)
    
    print("Prediction File Found")
    
    displayTrainDataset=pd.read_csv("Predicted Output.csv")
    
    print("Printing Output Data")
    
    display(displayTrainDataset)
    
    print("*"*124)
    
except:
    print("*"*124)
    print("File Not Found")
    print("Creating Files .................")
    MakingPredictionFile(MatrixForTestDataset,MatrixForTrainDataset)
    print("Files Created !")
    
    displayTrainDataset=pd.read_csv("Predicted Output.csv")
    
    print("Printing Output Data")
    
    display(displayTrainDataset)
    
    print("*"*124)
    
    


****************************************************************************************************************************
Prediction File Found
Printing Output Data


Unnamed: 0,Document Index,Predicted Ouptut
0,Document 0,athletics
1,Document 1,athletics
2,Document 2,athletics
3,Document 3,athletics
4,Document 4,athletics
...,...,...
215,Document 215,tennis
216,Document 216,tennis
217,Document 217,football
218,Document 218,football


****************************************************************************************************************************


## Prediction

In [22]:
TestOutput=pd.read_csv('Test output.csv')
PredictedOutput=pd.read_csv("Predicted Output.csv")

index=0
PredictCorrect=0
PredictWrong=0
ActualClass="athletics"
TotalClassDocuments=0
TotalPredictedCorrect=0

print(">"*123)
for i,y in zip(TestOutput['Actual Output'],PredictedOutput['Predicted Ouptut']):
    
    PreviousClass=ActualClass
    
    ActualClass=i
    
    AfterClass=ActualClass
    
    if PreviousClass!=AfterClass:
        
        print("-"*123)
        print(" "*50,PreviousClass.capitalize())
        print("Total Test Documents:   ",TotalClassDocuments)
        print("Predicted right:   ",PredictCorrect)
        print("Predicted wrong:   ",PredictWrong)
        print(PreviousClass.capitalize()," Class Accuracy:   ",(PredictCorrect/TotalClassDocuments)*100)
        print("-"*123)
        PredictCorrect=0
        PredictWrong=0
        TotalClassDocuments=0
    
    if i==y:
        TotalPredictedCorrect+=1
        PredictCorrect+=1
    else:
        PredictWrong+=1
        
    TotalClassDocuments+=1

print("-"*123)
print(" "*50,PreviousClass.capitalize())
print("Total Test Documents:   ",TotalClassDocuments)
print("Predicted right:   ",PredictCorrect)
print("Predicted wrong:   ",PredictWrong)
print(PreviousClass.capitalize()," Class Accuracy:   ",(PredictCorrect/TotalClassDocuments)*100)
print("-"*123)
print("Total Accuracy :          ",(TotalPredictedCorrect/len(MatrixForTestDataset))*100)
print(">"*123)

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
---------------------------------------------------------------------------------------------------------------------------
                                                   Athletics
Total Test Documents:    30
Predicted right:    28
Predicted wrong:    2
Athletics  Class Accuracy:    93.33333333333333
---------------------------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------
                                                   Cricket
Total Test Documents:    37
Predicted right:    30
Predicted wrong:    7
Cricket  Class Accuracy:    81.08108108108108
---------------------------------------------------------------------------------------------------------------------------
--------------------

# K-Means Clustering

In [23]:
def ClassDistributionOfTerms():
    
    Dictionary={}
    
    Folders=next(os.walk('bbcsport/'))[1]

    for Folder in Folders:
    
        Files=next(os.walk('bbcsport/'+Folder))[2]
    
        for FileName in Files:
        
            File=open("bbcsport/"+Folder+'/'+FileName)
        
            Speech=File.readlines()
        
            # Filtering Data
        
            StopWordList=GeneratingStopWordsList(StopWords)
        
            Tokens=GenerateTokensBySpace(Speech)
        
            Tokens=RemovingContractions(Tokens)
        
            Tokens=RemovingDots(Tokens)
        
            Tokens=LOWERCASECONVERTOR(Tokens)
        
            Tokens=RemovingBraces(Tokens)
        
            Tokens=RemovingHypens(Tokens)
            
            Tokens=FinalFilter(Tokens)
        
            Tokens=RemovingStopWords(Tokens,StopWordList)
        
            Tokens=PorterStemming(Tokens)
        
        
            for i in range(0,len(Tokens)):
                
                if Tokens[i] not in Dictionary:
                    
                    Dictionary.setdefault(Tokens[i],{})
                    
                    Dictionary[Tokens[i]].setdefault(Folder,1)
                
                else:
                    
                    if Folder in Dictionary[Tokens[i]]:
                        
                        Quantity = Dictionary[Tokens[i]][Folder]
                        
                        Quantity+=1
                        
                        Dictionary[Tokens[i]][Folder]=Quantity
                    
                    else:
                        
                        Dictionary[Tokens[i]].setdefault(Folder,1)

            
            
    
#     with open('TermFrequencyMatrix.p', 'wb') as fp:
#         pickle.dump(MatrixForTrainDataset,fp, protocol=pickle.HIGHEST_PROTOCOL)
    
    return Dictionary

In [41]:
def GenerateTotalDatasetFile():
    
    ClassDistribution=ClassDistributionOfTerms()
    
    QtyTerms=0
    
    NewDictionary={}
    
    
    
    
    
    for Term in ClassDistribution:
        
        for Class in ClassDistribution[Term]:
            
            if ClassDistribution[Term][Class]>2:
                
                QtyTerms+=1
        
#                 print(ClassDistribution[Term])
        
                NewDictionary.setdefault(Term)
        
        
#         if len(ClassDistribution[Term])<5:
            
#             QtyTerms+=1
        
#             NewDictionary.setdefault(Term)
        
    SortedKeys=sorted(NewDictionary)
    
    
#     Dictionary=GeneratePostingList('bbcsport/')
    
#     SortedKeys=sorted(Dictionary)
    
    
    Folders=next(os.walk('bbcsport/'))[1]
    TotalDocuments=len(next(os.walk('bbcsport/'+Folders[0]))[2])
    TotalDocuments+=len(next(os.walk('bbcsport/'+Folders[1]))[2])
    TotalDocuments+=len(next(os.walk('bbcsport/'+Folders[2]))[2])
    TotalDocuments+=len(next(os.walk('bbcsport/'+Folders[3]))[2])
    TotalDocuments+=len(next(os.walk('bbcsport/'+Folders[4]))[2])

    
    
    MatrixForTrainDataset=[[0 for i in range(0,len(SortedKeys)+1)]for j in range(0,TotalDocuments)]

    TermIdfMatrix=[[0 for i in range(0,len(SortedKeys)+1)]for j in range(0,TotalDocuments)]
    
    IndexOfFile=0

    Folders=next(os.walk('bbcsport/'))[1]

    for Folder in Folders:
    
        Files=next(os.walk('bbcsport/'+Folder))[2]
    
        for FileName in Files:
        
            File=open("bbcsport/"+Folder+'/'+FileName)
        
            Speech=File.readlines()
        
            # Filtering Data
        
            StopWordList=GeneratingStopWordsList(StopWords)
        
            Tokens=GenerateTokensBySpace(Speech)
        
            Tokens=RemovingContractions(Tokens)
        
            Tokens=RemovingDots(Tokens)
        
            Tokens=LOWERCASECONVERTOR(Tokens)
        
            Tokens=RemovingBraces(Tokens)
        
            Tokens=RemovingHypens(Tokens)
            
            Tokens=FinalFilter(Tokens)
        
            Tokens=RemovingStopWords(Tokens,StopWordList)
        
            Tokens=PorterStemming(Tokens)
        
        
            for i in range(0,len(Tokens)):
                
                try:
            
                    MatrixForTrainDataset[IndexOfFile][SortedKeys.index(Tokens[i])]+=1
            
                    TermIdfMatrix[IndexOfFile][SortedKeys.index(Tokens[i])]=1
                
                except:
                    pass
         

        
            MatrixForTrainDataset[IndexOfFile][len(MatrixForTrainDataset[IndexOfFile])-1]=Folder
        
        
            TermIdfMatrix[IndexOfFile][len(MatrixForTrainDataset[IndexOfFile])-1]=Folder
        
            IndexOfFile+=1
    
    
    for column in range(0,len(TermIdfMatrix[0])-1):
        
        NoOfDocumentForTerm=0
    
        for row in range(0,len(TermIdfMatrix)):
            
            NoOfDocumentForTerm+=TermIdfMatrix[row][column]
            
    
        for row in range(0,len(TermIdfMatrix)):
            
            if MatrixForTrainDataset[row][column]!=0:
                
                MatrixForTrainDataset[row][column]*=(math.log2(TotalDocuments/NoOfDocumentForTerm))
    
    
    with open('Kmeansdataset.p', 'wb') as fp:
        pickle.dump(MatrixForTrainDataset,fp, protocol=pickle.HIGHEST_PROTOCOL)
    
    return MatrixForTrainDataset,SortedKeys

In [42]:
MatrixForTrainDataset,Dictionary = GenerateTotalDatasetFile()

In [43]:
try:
    print("*"*124)
    File = open("Kmeansdataset.p","rb")
    MatrixForTrainDataset = pickle.load(File)
    print("Total data Set File Found")
    print("*"*124)
except:
    
    print("File Not Found")
    
    print("Creating Files .................")
    
    MatrixForTrainDataset,Dictionary = GenerateTotalDatasetFile()
    
    SortedKeys=sorted(Dictionary)
    
    print("Printing Train Data .................")
    
    columns=SortedKeys.copy()
    
    columns.append("Class")
    
    File=pd.DataFrame(MatrixForTrainDataset,columns=columns)
    
    display(File)
    
    print("Files Created !")

****************************************************************************************************************************
Total data Set File Found
****************************************************************************************************************************


In [44]:
def SetDocuments(D1,D2,D3,D4,D5):
    
    NewMatrixForTrainDataset=np.delete(MatrixForTrainDataset,(D1,D2,D3,D4,D5),axis=0)
    
    D1=MatrixForTrainDataset[D1]
    D2=MatrixForTrainDataset[D2]
    D3=MatrixForTrainDataset[D3]
    D4=MatrixForTrainDataset[D4]
    D5=MatrixForTrainDataset[D5]
    
    return MatrixForTrainDataset,NewMatrixForTrainDataset,D1,D2,D3,D4,D5

In [45]:
def DocumentCluster(D1,D2,D3,D4,D5):
    
    DocumentCluster={}
    
    DocumentCluster.setdefault(0,[])
    DocumentCluster.setdefault(1,[])
    DocumentCluster.setdefault(2,[])
    DocumentCluster.setdefault(3,[])
    DocumentCluster.setdefault(4,[])
    
    
    MatrixForTrainDataset,NewMatrixForTrainDataset,D1,D2,D3,D4,D5=SetDocuments(D1,D2,D3,D4,D5)
    
    RandomFiveDocuments=[D1,D2,D3,D4,D5]
    
    print("Calculating Distance ...............................")
    
    for i in range(0,len(NewMatrixForTrainDataset)):
    
        DocTrain=NewMatrixForTrainDataset[i][0:len(NewMatrixForTrainDataset[0])-1]
    
        MiniMumDistance={}
    
        for j in range(0,len(RandomFiveDocuments)):
        
            DocTest=RandomFiveDocuments[j][0:len(RandomFiveDocuments[0])-1]
        
            distance = math.sqrt(sum([(float(a) - float(b)) ** 2 for a, b in zip(DocTest,DocTrain)]))
        
            MiniMumDistance.setdefault(distance,j)
    
        DocumentCluster[[b for a, b in sorted(MiniMumDistance.items())][0]].append(i)
    
    
    
    print("Calculating Mean .................................................")
    
    
    DocumentClusterMean={}
    
    DocumentClusterMean.setdefault(0,[0 for i in range(0,len(MatrixForTrainDataset[0]))])
    
    DocumentClusterMean.setdefault(1,[0 for i in range(0,len(MatrixForTrainDataset[0]))])
    
    DocumentClusterMean.setdefault(2,[0 for i in range(0,len(MatrixForTrainDataset[0]))])
    
    DocumentClusterMean.setdefault(3,[0 for i in range(0,len(MatrixForTrainDataset[0]))])
    
    DocumentClusterMean.setdefault(4,[0 for i in range(0,len(MatrixForTrainDataset[0]))])
    
    
    
    for Cluster in range(0,len(DocumentCluster)):
    
        for Doc in DocumentCluster[Cluster]:
        
            MatrixOne=DocumentClusterMean[Cluster]
        
            MatrixTwo=MatrixForTrainDataset[Doc][0:len(MatrixForTrainDataset[0])-1]
        
            DocumentClusterMean[Cluster]=[x+y for x,y in zip(MatrixOne,MatrixTwo)]
        
        try:
            DocumentClusterMean[Cluster] = [float(x) / float(len(DocumentCluster[Cluster]))  for x in DocumentClusterMean[Cluster] ]
        except:
            pass
    
    DocumentClusterAfterMean={}
    
    DocumentClusterAfterMean.setdefault(0,[])
    
    DocumentClusterAfterMean.setdefault(1,[])
    
    DocumentClusterAfterMean.setdefault(2,[])
    
    DocumentClusterAfterMean.setdefault(3,[])
    
    DocumentClusterAfterMean.setdefault(4,[])
    
    print("Calculating Document Cluster After Calculating Mean ................................")
    
    for i in range(0,len(MatrixForTrainDataset)):
    
        DocTrain=MatrixForTrainDataset[i][0:len(MatrixForTrainDataset[0])-1]
    
        MiniMumDistance={}
    
        for j in range(0,len(DocumentClusterMean)):
        
            DocTest=DocumentClusterMean[j]
        
            distance = math.sqrt(sum([(float(a) - float(b)) ** 2 for a, b in zip(DocTest,DocTrain)]))
        
            MiniMumDistance.setdefault(distance,j)
    
        DocumentClusterAfterMean[[b for a, b in sorted(MiniMumDistance.items())][0]].append(i)
    
    
    print("Converging Cluster...............................")
    
    Iterations=0
    
    while DocumentCluster[0]!=DocumentClusterAfterMean[0] or DocumentCluster[1]!=DocumentClusterAfterMean[1] or DocumentCluster[2]!=DocumentClusterAfterMean[2] or DocumentCluster[3]!=DocumentClusterAfterMean[3] or DocumentCluster[4]!=DocumentClusterAfterMean[4]:
        
        Iterations+=1
        
        print("Iterations : ",Iterations)
    
        DocumentCluster=DocumentClusterAfterMean
    
    
        DocumentClusterMean={}
        
        DocumentClusterMean.setdefault(0,[0 for i in range(0,len(MatrixForTrainDataset[0]))])

        DocumentClusterMean.setdefault(1,[0 for i in range(0,len(MatrixForTrainDataset[0]))])

        DocumentClusterMean.setdefault(2,[0 for i in range(0,len(MatrixForTrainDataset[0]))])
    
        DocumentClusterMean.setdefault(3,[0 for i in range(0,len(MatrixForTrainDataset[0]))])
    
        DocumentClusterMean.setdefault(4,[0 for i in range(0,len(MatrixForTrainDataset[0]))])
    
        for Cluster in range(0,len(DocumentClusterAfterMean)):
        
            for Doc in DocumentClusterAfterMean[Cluster]:
            
                MatrixOne=DocumentClusterMean[Cluster]
            
            
#                 print(Doc,len(MatrixForTrainDataset[0])-1)
                
                MatrixTwo=MatrixForTrainDataset[Doc][0:len(MatrixForTrainDataset[0])-1]
            
                DocumentClusterMean[Cluster]=[x+y for x,y in zip(MatrixOne,MatrixTwo)]
     
            try:
                DocumentClusterMean[Cluster] = [float(x) / float(len(DocumentClusterAfterMean[Cluster]))  for x in DocumentClusterMean[Cluster] ]
            except:
                pass

    
        DocumentClusterAfterMean={}
    
        DocumentClusterAfterMean.setdefault(0,[])
    
        DocumentClusterAfterMean.setdefault(1,[])
    
        DocumentClusterAfterMean.setdefault(2,[])
    
        DocumentClusterAfterMean.setdefault(3,[])
    
        DocumentClusterAfterMean.setdefault(4,[])
    
    
        for i in range(0,len(MatrixForTrainDataset)):
    
            DocTrain=MatrixForTrainDataset[i][0:len(MatrixForTrainDataset[0])-1]
    
            MiniMumDistance={}
    
            for j in range(0,len(DocumentClusterMean)):
        
                DocTest=DocumentClusterMean[j]
        
                distance = math.sqrt(sum([(float(a) - float(b)) ** 2 for a, b in zip(DocTest,DocTrain)]))
        
                MiniMumDistance.setdefault(distance,j)
    
            DocumentClusterAfterMean[[b for a, b in sorted(MiniMumDistance.items())][0]].append(i)
    
    return DocumentCluster,DocumentClusterAfterMean

In [52]:
d,d2=DocumentCluster(200,500,400,600,700)

Calculating Distance ...............................
Calculating Mean .................................................
Calculating Document Cluster After Calculating Mean ................................
Converging Cluster...............................
Iterations :  1
Iterations :  2
Iterations :  3
Iterations :  4
Iterations :  5
Iterations :  6
Iterations :  7
Iterations :  8
Iterations :  9
Iterations :  10
Iterations :  11
Iterations :  12
Iterations :  13
Iterations :  14
Iterations :  15
Iterations :  16
Iterations :  17
Iterations :  18
Iterations :  19
Iterations :  20
Iterations :  21
Iterations :  22
Iterations :  23
Iterations :  24
Iterations :  25


In [53]:
print(len(d[0]),len(d[1]),len(d[2]),len(d[3]),len(d[4]),len(MatrixForTrainDataset))

20 624 2 3 88 737


In [166]:
def GenerateTermFrequencyMatrix():
    
    Dictionary=GeneratePostingList('bbcsport/')
    
    SortedKeys=sorted(Dictionary)
    
    
    Folders=next(os.walk('bbcsport/'))[1]
    
    TotalDocuments=len(next(os.walk('bbcsport/'+Folders[0]))[2])
    TotalDocuments+=len(next(os.walk('bbcsport/'+Folders[1]))[2])
    TotalDocuments+=len(next(os.walk('bbcsport/'+Folders[2]))[2])
    TotalDocuments+=len(next(os.walk('bbcsport/'+Folders[3]))[2])
    TotalDocuments+=len(next(os.walk('bbcsport/'+Folders[4]))[2])

    
    
    MatrixForTrainDataset=[[0 for i in range(0,len(SortedKeys)+1)]for j in range(0,TotalDocuments)]

    TermIdfMatrix=[[0 for i in range(0,len(SortedKeys)+1)]for j in range(0,TotalDocuments)]
    
    IndexOfFile=0

    Folders=next(os.walk('bbcsport/'))[1]

    for Folder in Folders:
    
        Files=next(os.walk('bbcsport/'+Folder))[2]
    
        for FileName in Files:
        
            File=open("bbcsport/"+Folder+'/'+FileName)
        
            Speech=File.readlines()
        
            # Filtering Data
        
            StopWordList=GeneratingStopWordsList(StopWords)
        
            Tokens=GenerateTokensBySpace(Speech)
        
            Tokens=RemovingContractions(Tokens)
        
            Tokens=RemovingDots(Tokens)
        
            Tokens=LOWERCASECONVERTOR(Tokens)
        
            Tokens=RemovingBraces(Tokens)
        
            Tokens=RemovingHypens(Tokens)
            
            Tokens=FinalFilter(Tokens)
        
            Tokens=RemovingStopWords(Tokens,StopWordList)
        
            Tokens=PorterStemming(Tokens)
        
        
            for i in range(0,len(Tokens)):
            
                MatrixForTrainDataset[IndexOfFile][SortedKeys.index(Tokens[i])]+=1
            
                TermIdfMatrix[IndexOfFile][SortedKeys.index(Tokens[i])]=1
            
         

        
            MatrixForTrainDataset[IndexOfFile][len(MatrixForTrainDataset[IndexOfFile])-1]=Folder
        
        
            TermIdfMatrix[IndexOfFile][len(MatrixForTrainDataset[IndexOfFile])-1]=Folder
        
            IndexOfFile+=1
    
#     with open('TermFrequencyMatrix.p', 'wb') as fp:
#         pickle.dump(MatrixForTrainDataset,fp, protocol=pickle.HIGHEST_PROTOCOL)
    
    return MatrixForTrainDataset,Dictionary.keys()



def ClassDistributionOfTerms():
    
    Dictionary={}
    
    Folders=next(os.walk('bbcsport/'))[1]

    for Folder in Folders:
    
        Files=next(os.walk('bbcsport/'+Folder))[2]
    
        for FileName in Files:
        
            File=open("bbcsport/"+Folder+'/'+FileName)
        
            Speech=File.readlines()
        
            # Filtering Data
        
            StopWordList=GeneratingStopWordsList(StopWords)
        
            Tokens=GenerateTokensBySpace(Speech)
        
            Tokens=RemovingContractions(Tokens)
        
            Tokens=RemovingDots(Tokens)
        
            Tokens=LOWERCASECONVERTOR(Tokens)
        
            Tokens=RemovingBraces(Tokens)
        
            Tokens=RemovingHypens(Tokens)
            
            Tokens=FinalFilter(Tokens)
        
            Tokens=RemovingStopWords(Tokens,StopWordList)
        
            Tokens=PorterStemming(Tokens)
        
        
            for i in range(0,len(Tokens)):
                
                if Tokens[i] not in Dictionary:
                    
                    Dictionary.setdefault(Tokens[i],{})
                    
                    Dictionary[Tokens[i]].setdefault(Folder,1)
                
                else:
                    
                    if Folder in Dictionary[Tokens[i]]:
                        
                        Quantity = Dictionary[Tokens[i]][Folder]
                        
                        Quantity+=1
                        
                        Dictionary[Tokens[i]][Folder]=Quantity
                    
                    else:
                        
                        Dictionary[Tokens[i]].setdefault(Folder,1)

            
            
    
#     with open('TermFrequencyMatrix.p', 'wb') as fp:
#         pickle.dump(MatrixForTrainDataset,fp, protocol=pickle.HIGHEST_PROTOCOL)
    
    return Dictionary



In [35]:
ClassDistribution=ClassDistributionOfTerms()

In [36]:
ClassDistribution

{'claxton': {'athletics': 14},
 'hunt': {'athletics': 1, 'cricket': 4, 'football': 4},
 'first': {'athletics': 49,
  'cricket': 219,
  'football': 198,
  'rugby': 175,
  'tennis': 149},
 'major': {'athletics': 18,
  'cricket': 8,
  'football': 34,
  'rugby': 19,
  'tennis': 9},
 'british': {'athletics': 59, 'cricket': 3, 'rugby': 12, 'tennis': 24},
 'hurdler': {'athletics': 5},
 'sarah': {'athletics': 7},
 'confid': {'athletics': 21,
  'cricket': 20,
  'football': 43,
  'rugby': 32,
  'tennis': 25},
 'she': {'athletics': 225, 'football': 7, 'tennis': 128},
 'win': {'athletics': 101,
  'cricket': 68,
  'football': 230,
  'rugby': 193,
  'tennis': 158},
 'medal': {'athletics': 58, 'football': 7, 'rugby': 2},
 'next': {'athletics': 41,
  'cricket': 46,
  'football': 76,
  'rugby': 54,
  'tennis': 40},
 'month': {'athletics': 45,
  'cricket': 21,
  'football': 71,
  'rugby': 22,
  'tennis': 21},
 'european': {'athletics': 100,
  'cricket': 1,
  'football': 40,
  'rugby': 12,
  'tennis': 1}

In [58]:
QtyTerms=0
NewDictionary={}
for Term in ClassDistribution:
    
    for Class in ClassDistribution[Term]:
        
        if ClassDistribution[Term][Class]>=3:
            
            QtyTerms+=1
#                 print(ClassDistribution[Term])
        
            NewDictionary.setdefault(Term)
        
SortedKeys=sorted(NewDictionary)
print(len(SortedKeys),SortedKeys)

4329 ['&', '0', '07', '0830', '1', '10', '100', '100%', '10000m', '1000m', '100m', '106', '108', '109', '10km', '10th', '11', '110', '111', '115', '117', '12', '120', '122', '129', '12th', '13', '131', '132', '135', '136', '139', '13th', '14', '140', '1400', '15', '1500m', '152', '16', '163', '165', '16th', '17', '172', '18', '184', '185', '19', '190', '1948', '196', '1970', '1978', '1985', '1986', '1987', '1988', '1991', '1993', '1994', '1996', '1997', '1998', '1999', '2', '20', '200', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '200m', '202', '208', '21', '22', '23', '24', '25', '25th', '26', '261', '264', '27', '28', '281', '29', '3', '30', '3000m', '304', '30m', '31', '314', '32', '33', '337', '34', '344', '35', '36', '37', '38', '39', '398', '4', '40', '400', '400m', '41', '42', '43', '44', '45', '46', '47', '48', '49', '4x100m', '5', '50', '5000m', '50m', '50th', '51', '52', '53', '54', '55', '56', '568', '57', '58', '59', '5m', '6', '60', '60m', '61',

In [50]:
print(d[1])

[29, 33, 39, 56, 57, 58, 60, 65, 67, 68, 69, 74, 92, 119, 130, 141, 159, 196, 214, 225, 231, 234, 235, 236, 237, 238, 239, 245, 264, 269, 276, 290, 293, 309, 310, 311, 312, 322, 335, 340, 346, 348, 357, 365, 370, 376, 378, 398, 399, 408, 413, 422, 426, 429, 431, 433, 436, 439, 443, 446, 447, 448, 460, 466, 470, 471, 473, 477, 483, 484, 490, 491, 493, 495, 496, 497, 499, 500, 505, 508, 509, 514, 516, 517, 518, 519, 520, 526, 528, 529, 533, 534, 535, 536, 537, 538, 539, 540, 544, 545, 546, 550, 555, 557, 560, 570, 574, 577, 579, 581, 582, 583, 584, 585, 586, 588, 589, 590, 591, 593, 597, 611, 617, 619, 620, 621, 622, 623, 626, 629, 630, 631, 632, 682, 685, 686, 692, 707, 711]


In [33]:
print(len(d[0]),len(d[1]),len(d[2]),len(d[3]),len(d[4]),len(MatrixForTrainDataset[0]))

14 2 27 632 62 2551


In [131]:
AthleticsClassDocuments=[]
TennisClassDocuments=[]
RugbyClassDocuments=[]
FootballClassDocuments=[]
CricketClassDocuments=[]

In [132]:
for i in range(0,len(MatrixForTrainDataset)):
    if MatrixForTrainDataset[i][len(MatrixForTrainDataset[0])-1]=="athletics":
        AthleticsClassDocuments.append(i)
    if MatrixForTrainDataset[i][len(MatrixForTrainDataset[0])-1]=="cricket":
        CricketClassDocuments.append(i)
    if MatrixForTrainDataset[i][len(MatrixForTrainDataset[0])-1]=="rugby":
        RugbyClassDocuments.append(i)
    if MatrixForTrainDataset[i][len(MatrixForTrainDataset[0])-1]=="football":
        FootballClassDocuments.append(i)
    if MatrixForTrainDataset[i][len(MatrixForTrainDataset[0])-1]=="tennis":
        TennisClassDocuments.append(i)

In [146]:
def intersection(lst1, lst2): 
    lst3 = [value for value in lst1 if value in lst2] 
    return lst3

In [165]:
for i in range(0,5):
    print("Cluster :",i)
    print("Athletics Class Length : ",len(intersection(AthleticsClassDocuments,d[i])))
    print("Football Class Length : ",len(intersection(FootballClassDocuments,d[i])))
    print("Rugby Class Length : ",len(intersection(RugbyClassDocuments,d[i])))
    print("\n")

Cluster : 0
Athletics Class Length :  0
Football Class Length :  0
Rugby Class Length :  18


Cluster : 1
Athletics Class Length :  78
Football Class Length :  237
Rugby Class Length :  32


Cluster : 2
Athletics Class Length :  23
Football Class Length :  0
Rugby Class Length :  0


Cluster : 3
Athletics Class Length :  0
Football Class Length :  1
Rugby Class Length :  97


Cluster : 4
Athletics Class Length :  0
Football Class Length :  27
Rugby Class Length :  0


