Skip to content

Commit

Permalink
Very big optimisation of the StateComparator that is now able to comp…
Browse files Browse the repository at this point in the history
…are quickly two States that contain 1 000 000 files
  • Loading branch information
evrignaud committed Apr 19, 2016
1 parent 71da756 commit eaf290b
Show file tree
Hide file tree
Showing 5 changed files with 191 additions and 59 deletions.
147 changes: 88 additions & 59 deletions src/main/java/org/fim/internal/StateComparator.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,15 @@
import static org.fim.model.HashMode.dontHash;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.atomic.AtomicBoolean;

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ListMultimap;
import org.apache.commons.lang3.SystemUtils;
import org.fim.model.CompareResult;
import org.fim.model.Context;
Expand All @@ -49,7 +53,7 @@ public class StateComparator
private State lastState;
private State currentState;

private List<FileState> previousFileStates;
private ListMultimap<FileHash, FileState> previousFileStates;
private List<FileState> notFoundInCurrentFileState;
private List<FileState> addedOrModified;
private int notModifiedCount;
Expand Down Expand Up @@ -77,7 +81,7 @@ private void init()

result = new CompareResult(context, lastState);

previousFileStates = new ArrayList<>();
previousFileStates = ArrayListMultimap.create();
notFoundInCurrentFileState = new ArrayList<>();
addedOrModified = new ArrayList<>();
}
Expand Down Expand Up @@ -159,22 +163,27 @@ private void searchForAddedOrModified()
logDebug("---------------------------------------------------------------------",
"lastState", lastState.getFileStates(), "currentState", currentState.getFileStates());

previousFileStates.addAll(lastState.getFileStates());
for (FileState fileState : lastState.getFileStates())
{
previousFileStates.put(fileState.getFileHash(), fileState);
}
}
else
{
logDebug("---------------------------------------------------------------------",
"currentState", currentState.getFileStates());
}

resetNewHash(previousFileStates);
resetNewHash(previousFileStates.values());

notFoundInCurrentFileState.addAll(previousFileStates);
Map<Long, FileState> previousFileStatesHashCodeMap = buildHashCodeMap(previousFileStates.values());

notModifiedCount = 0;
for (FileState fileState : currentState.getFileStates())
List<FileState> fileStates = currentState.getFileStates();
for (int index = 0, fileStatesSize = fileStates.size(); index < fileStatesSize; index++)
{
if (notFoundInCurrentFileState.remove(fileState))
FileState fileState = fileStates.get(index);
if (previousFileStatesHashCodeMap.remove(fileState.longHashCode()) != null)
{
notModifiedCount++;
}
Expand All @@ -183,104 +192,146 @@ private void searchForAddedOrModified()
addedOrModified.add(fileState);
}
}
notFoundInCurrentFileState.addAll(previousFileStatesHashCodeMap.values());

logDebug("Built addedOrModified", "notFoundInCurrentFileState", notFoundInCurrentFileState, "addedOrModified", addedOrModified);
}

private void searchForSameFileNames()
{
Map<String, FileState> notFoundInCurrentFileStateNamesMap = buildFileNamesMap(notFoundInCurrentFileState);

boolean managed;
FileState previousFileState;
Iterator<FileState> iterator = addedOrModified.iterator();
while (iterator.hasNext())
List<FileState> newAddedOrModified = new ArrayList<>();
for (FileState fileState : addedOrModified)
{
FileState fileState = iterator.next();
if ((previousFileState = findFileWithSameFileName(fileState, notFoundInCurrentFileState)) != null)
managed = false;
if ((previousFileState = findFileWithSameFileName(fileState, notFoundInCurrentFileStateNamesMap)) != null)
{
notFoundInCurrentFileState.remove(previousFileState);
notFoundInCurrentFileStateNamesMap.remove(previousFileState.getFileName());

if (result.isSearchForHardwareCorruption())
{
if (false == previousFileState.getFileHash().equals(fileState.getFileHash()) && previousFileState.getFileTime().equals(fileState.getFileTime()))
if (!previousFileState.getFileHash().equals(fileState.getFileHash()) && previousFileState.getFileTime().equals(fileState.getFileTime()))
{
result.getCorrupted().add(new Difference(previousFileState, fileState));
fileState.setModification(Modification.corrupted);
iterator.remove();
managed = true;
}
}
else
{
if (previousFileState.getFileHash().equals(fileState.getFileHash()))
{
if (false == previousFileState.getFileTime().equals(fileState.getFileTime()))
if (!previousFileState.getFileTime().equals(fileState.getFileTime()))
{
result.getDateModified().add(new Difference(previousFileState, fileState));
fileState.setModification(Modification.dateModified);
iterator.remove();
managed = true;
}
else if (false == Objects.equals(previousFileState.getFileAttributes(), fileState.getFileAttributes()))
else if (!Objects.equals(previousFileState.getFileAttributes(), fileState.getFileAttributes()))
{
result.getAttributesModified().add(new Difference(previousFileState, fileState));
fileState.setModification(Modification.attributesModified);
iterator.remove();
managed = true;
}
}
else
{
result.getContentModified().add(new Difference(previousFileState, fileState));
fileState.setModification(Modification.contentModified);
iterator.remove();
managed = true;

// File has been modified so set the new hash for accurate duplicate detection
previousFileState.setNewFileHash(new FileHash(fileState.getFileHash()));
}
}
}

if (!managed)
{
newAddedOrModified.add(fileState);
}
}
addedOrModified = newAddedOrModified;
notFoundInCurrentFileState = new ArrayList<>(notFoundInCurrentFileStateNamesMap.values());

logDebug("Search done for same FileNames", "notFoundInCurrentFileState", notFoundInCurrentFileState, "addedOrModified", addedOrModified);
}

private void searchForDifferences()
{
Map<Long, FileState> notFoundInCurrentFileStateHashCodeMap = buildHashCodeMap(notFoundInCurrentFileState);

List<FileState> samePreviousHash;
Iterator<FileState> iterator = addedOrModified.iterator();
while (iterator.hasNext())
for (FileState fileState : addedOrModified)
{
FileState fileState = iterator.next();
if ((context.getHashMode() != dontHash) &&
((samePreviousHash = findFilesWithSameHash(fileState, previousFileStates)).size() > 0))
{
FileState originalFileState = samePreviousHash.get(0);
if (notFoundInCurrentFileState.contains(originalFileState))
long originalFileStateHashCode = originalFileState.longHashCode();
if (notFoundInCurrentFileStateHashCodeMap.containsKey(originalFileStateHashCode))
{
result.getRenamed().add(new Difference(originalFileState, fileState));
fileState.setModification(Modification.renamed);
iterator.remove();
}
else
{
if (contentChanged(originalFileState))
{
result.getCopied().add(new Difference(originalFileState, fileState));
fileState.setModification(Modification.copied);
iterator.remove();
}
else
{
result.getDuplicated().add(new Difference(originalFileState, fileState));
fileState.setModification(Modification.duplicated);
iterator.remove();
}
}
notFoundInCurrentFileState.remove(originalFileState);
notFoundInCurrentFileStateHashCodeMap.remove(originalFileStateHashCode);
}
else
{
result.getAdded().add(new Difference(null, fileState));
fileState.setModification(Modification.added);
iterator.remove();
}
}
addedOrModified.clear();
notFoundInCurrentFileState = new ArrayList<>(notFoundInCurrentFileStateHashCodeMap.values());
}

/**
 * Indexes the given file states by file name.
 *
 * @param fileStates the file states to index
 * @return a map from file name to the file state carrying that name
 * @throws IllegalStateException if the resulting map holds fewer entries than the
 *         input collection, i.e. several file states share the same file name
 */
private Map<String, FileState> buildFileNamesMap(Collection<FileState> fileStates)
{
	Map<String, FileState> byName = new HashMap<>();
	fileStates.forEach(state -> byName.put(state.getFileName(), state));

	// Every input must own its own key; a smaller map means an entry was overwritten
	int inputSize = fileStates.size();
	int indexedSize = byName.size();
	if (inputSize == indexedSize)
	{
		return byName;
	}

	throw new IllegalStateException(String.format("Duplicated entries: Size=%d, MapSize=%d", inputSize, indexedSize));
}

/**
 * Indexes the given file states by the value of their {@code longHashCode()}.
 *
 * @param fileStates the file states to index
 * @return a map from long hash code to the file state that produced it
 * @throws IllegalStateException if the resulting map holds fewer entries than the
 *         input collection, i.e. several file states produced the same long hash code
 */
private Map<Long, FileState> buildHashCodeMap(Collection<FileState> fileStates)
{
	Map<Long, FileState> byHashCode = new HashMap<>();
	Iterator<FileState> cursor = fileStates.iterator();
	while (cursor.hasNext())
	{
		FileState current = cursor.next();
		byHashCode.put(current.longHashCode(), current);
	}

	// A size mismatch means at least one key was written twice
	boolean everyEntryUnique = fileStates.size() == byHashCode.size();
	if (!everyEntryUnique)
	{
		String message = String.format("Duplicated entries: Size=%d, MapSize=%d", fileStates.size(), byHashCode.size());
		throw new IllegalStateException(message);
	}
	return byHashCode;
}

private void checkAllFilesManagedCorrectly()
Expand All @@ -299,12 +350,9 @@ private void checkAllFilesManagedCorrectly()

private void searchForDeleted()
{
notFoundInCurrentFileState.stream().
filter(fileState -> !isFileIgnored(fileState)).
forEach(fileState ->
{
result.getDeleted().add(new Difference(null, fileState));
});
notFoundInCurrentFileState.stream()
.filter(fileState -> !isFileIgnored(fileState))
.forEach(fileState -> result.getDeleted().add(new Difference(null, fileState)));
}

private boolean isFileIgnored(FileState fileState)
Expand Down Expand Up @@ -372,40 +420,21 @@ private String fileStatesToString(String message, List<FileState> fileStates)
return builder.toString();
}

private void resetNewHash(List<FileState> fileStates)
private void resetNewHash(Collection<FileState> fileStates)
{
for (FileState fileState : fileStates)
{
fileState.resetNewHash();
}
}

private FileState findFileWithSameFileName(FileState search, List<FileState> fileStates)
private FileState findFileWithSameFileName(FileState search, Map<String, FileState> fileStates)
{
int index = 0;
for (FileState fileState : fileStates)
{
if (fileState.getFileName().equals(search.getFileName()))
{
return fileStates.get(index);
}
index++;
}

return null;
return fileStates.get(search.getFileName());
}

private List<FileState> findFilesWithSameHash(FileState search, List<FileState> fileStates)
private List<FileState> findFilesWithSameHash(FileState search, ListMultimap<FileHash, FileState> fileStates)
{
List<FileState> sameHash = new ArrayList<>();
for (FileState fileState : fileStates)
{
if (fileState.getFileHash().equals(search.getFileHash()))
{
sameHash.add(fileState);
}
}

return sameHash;
return fileStates.get(search.getFileHash());
}
}
10 changes: 10 additions & 0 deletions src/main/java/org/fim/model/FileState.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import com.google.common.base.Charsets;
import com.google.common.base.MoreObjects;
import com.google.common.hash.Hasher;
import org.fim.util.ObjectsUtil;

public class FileState implements Hashable
{
Expand Down Expand Up @@ -166,6 +167,15 @@ public int hashCode()
return Objects.hash(fileName, fileLength, fileTime, fileHash, fileAttributes);
}

/**
 * Returns a 64-bit hash code for this object.
 * It is computed by {@code ObjectsUtil.longHash} over the same fields that
 * {@link #hashCode()} uses: fileName, fileLength, fileTime, fileHash and fileAttributes.
 * The wider result is meant to make collisions less likely than with the int
 * {@code hashCode()} when a huge number of FileStates is handled.
 *
 * @return the long hash code of this FileState
 */
public long longHashCode()
{
	final long wideHash = ObjectsUtil.longHash(fileName, fileLength, fileTime, fileHash, fileAttributes);
	return wideHash;
}

@Override
public String toString()
{
Expand Down
37 changes: 37 additions & 0 deletions src/main/java/org/fim/util/ObjectsUtil.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/*
* This file is part of Fim - File Integrity Manager
*
* Copyright (C) 2015 Etienne Vrignaud
*
* Fim is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Fim is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Fim. If not, see <http://www.gnu.org/licenses/>.
*/
package org.fim.util;

/**
 * Utility methods that operate on arbitrary objects.
 */
public class ObjectsUtil
{
	/**
	 * Combines the {@code hashCode()} of every supplied value into a single long.
	 * The accumulator starts at 1 and, for each value in order, is multiplied by 31
	 * and increased by that value's hash code; a {@code null} value contributes 0.
	 * The arithmetic is done on a long, so the result is not truncated to 32 bits.
	 *
	 * @param values the values to hash; may be {@code null} or contain {@code null} elements
	 * @return 0 when {@code values} itself is {@code null}, 1 when it is empty,
	 *         otherwise the combined hash
	 */
	public static long longHash(Object... values)
	{
		long hash = 0;
		if (values != null)
		{
			hash = 1;
			for (int position = 0; position < values.length; position++)
			{
				Object current = values[position];
				int elementHash = (current != null) ? current.hashCode() : 0;
				hash = hash * 31 + elementHash;
			}
		}
		return hash;
	}
}
Loading

0 comments on commit eaf290b

Please sign in to comment.